Add story scraping: single-story, author-page and bulk URL import (UI + API routes)

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

View File

@@ -12,6 +12,9 @@ import ImageUpload from '../../components/ui/ImageUpload';
import { storyApi, authorApi } from '../../lib/api';
export default function AddStoryPage() {
const [importMode, setImportMode] = useState<'manual' | 'url'>('manual');
const [importUrl, setImportUrl] = useState('');
const [scraping, setScraping] = useState(false);
const [formData, setFormData] = useState({
title: '',
summary: '',
@@ -130,6 +133,57 @@ export default function AddStoryPage() {
setFormData(prev => ({ ...prev, tags }));
};
const handleImportFromUrl = async () => {
if (!importUrl.trim()) {
setErrors({ importUrl: 'URL is required' });
return;
}
setScraping(true);
setErrors({});
try {
const response = await fetch('/scrape/story', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ url: importUrl }),
});
if (!response.ok) {
const errorData = await response.json();
throw new Error(errorData.error || 'Failed to scrape story');
}
const scrapedStory = await response.json();
// Pre-fill the form with scraped data
setFormData({
title: scrapedStory.title || '',
summary: scrapedStory.summary || '',
authorName: scrapedStory.author || '',
contentHtml: scrapedStory.content || '',
sourceUrl: scrapedStory.sourceUrl || importUrl,
tags: scrapedStory.tags || [],
seriesName: '',
volume: '',
});
// Switch to manual mode so user can edit the pre-filled data
setImportMode('manual');
setImportUrl('');
// Show success message
setErrors({ success: 'Story data imported successfully! Review and edit as needed before saving.' });
} catch (error: any) {
console.error('Failed to import story:', error);
setErrors({ importUrl: error.message });
} finally {
setScraping(false);
}
};
const validateForm = () => {
const newErrors: Record<string, string> = {};
@@ -206,7 +260,105 @@ export default function AddStoryPage() {
</p>
</div>
<form onSubmit={handleSubmit} className="space-y-6">
{/* Import Mode Toggle */}
<div className="mb-8">
<div className="flex border-b border-gray-200 dark:border-gray-700">
<button
type="button"
onClick={() => setImportMode('manual')}
className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
importMode === 'manual'
? 'border-theme-accent text-theme-accent'
: 'border-transparent theme-text hover:text-theme-accent'
}`}
>
Manual Entry
</button>
<button
type="button"
onClick={() => setImportMode('url')}
className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
importMode === 'url'
? 'border-theme-accent text-theme-accent'
: 'border-transparent theme-text hover:text-theme-accent'
}`}
>
Import from URL
</button>
</div>
</div>
{/* URL Import Section */}
{importMode === 'url' && (
<div className="bg-gray-50 dark:bg-gray-800/50 rounded-lg p-6 mb-8">
<h3 className="text-lg font-medium theme-header mb-4">Import Story from URL</h3>
<p className="theme-text text-sm mb-4">
Enter a URL from a supported story site to automatically extract the story content, title, author, and other metadata.
</p>
<div className="space-y-4">
<Input
label="Story URL"
type="url"
value={importUrl}
onChange={(e) => setImportUrl(e.target.value)}
placeholder="https://example.com/story-url"
error={errors.importUrl}
disabled={scraping}
/>
<div className="flex gap-3">
<Button
type="button"
onClick={handleImportFromUrl}
loading={scraping}
disabled={!importUrl.trim() || scraping}
>
{scraping ? 'Importing...' : 'Import Story'}
</Button>
<Button
type="button"
variant="ghost"
onClick={() => setImportMode('manual')}
disabled={scraping}
>
Enter Manually Instead
</Button>
</div>
<div className="border-t pt-4 mt-4">
<p className="text-sm theme-text mb-2">
Need to import multiple stories at once?
</p>
<Button
type="button"
variant="secondary"
onClick={() => router.push('/stories/import/bulk')}
disabled={scraping}
size="sm"
>
Bulk Import Multiple URLs
</Button>
</div>
<div className="text-xs theme-text">
<p className="font-medium mb-1">Supported Sites:</p>
<p>Archive of Our Own, DeviantArt, FanFiction.Net, Literotica, Royal Road, Wattpad, and more</p>
</div>
</div>
</div>
)}
{/* Success Message */}
{errors.success && (
<div className="p-4 bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg mb-6">
<p className="text-green-800 dark:text-green-200">{errors.success}</p>
</div>
)}
{importMode === 'manual' && (
<form onSubmit={handleSubmit} className="space-y-6">
{/* Title */}
<Input
label="Title *"
@@ -379,6 +531,7 @@ export default function AddStoryPage() {
</Button>
</div>
</form>
)}
</div>
</AppLayout>
);

View File

@@ -0,0 +1,72 @@
import { NextRequest, NextResponse } from 'next/server';
export async function POST(request: NextRequest) {
try {
const body = await request.json();
const { url } = body;
if (!url || typeof url !== 'string') {
return NextResponse.json(
{ error: 'URL is required and must be a string' },
{ status: 400 }
);
}
// Dynamic import to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const stories = await scraper.scrapeAuthorPage(url);
return NextResponse.json({ stories });
} catch (error) {
console.error('Author page scraping error:', error);
// Check if it's a ScraperError without importing at module level
if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
return NextResponse.json(
{
error: (error as any).message,
url: (error as any).url
},
{ status: 400 }
);
}
if (error instanceof Error) {
// Handle specific error types
if (error.message.includes('Invalid URL')) {
return NextResponse.json(
{ error: 'Invalid URL provided' },
{ status: 400 }
);
}
if (error.message.includes('not supported')) {
return NextResponse.json(
{ error: 'Author page scraping is not supported for this website' },
{ status: 400 }
);
}
if (error.message.includes('HTTP 404')) {
return NextResponse.json(
{ error: 'Author page not found at the provided URL' },
{ status: 404 }
);
}
if (error.message.includes('timeout')) {
return NextResponse.json(
{ error: 'Request timed out while fetching content' },
{ status: 408 }
);
}
}
return NextResponse.json(
{ error: 'Failed to scrape author page. Please try again.' },
{ status: 500 }
);
}
}

View File

@@ -0,0 +1,292 @@
import { NextRequest, NextResponse } from 'next/server';
interface BulkImportRequest {
urls: string[];
}
interface ImportResult {
url: string;
status: 'imported' | 'skipped' | 'error';
reason?: string;
title?: string;
author?: string;
error?: string;
storyId?: string;
}
interface BulkImportResponse {
results: ImportResult[];
summary: {
total: number;
imported: number;
skipped: number;
errors: number;
};
}
export async function POST(request: NextRequest) {
try {
// Check for authentication
const authorization = request.headers.get('authorization');
if (!authorization) {
return NextResponse.json(
{ error: 'Authentication required for bulk import' },
{ status: 401 }
);
}
const body = await request.json();
const { urls } = body as BulkImportRequest;
if (!urls || !Array.isArray(urls) || urls.length === 0) {
return NextResponse.json(
{ error: 'URLs array is required and must not be empty' },
{ status: 400 }
);
}
if (urls.length > 50) {
return NextResponse.json(
{ error: 'Maximum 50 URLs allowed per bulk import' },
{ status: 400 }
);
}
// Dynamic imports to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const results: ImportResult[] = [];
let importedCount = 0;
let skippedCount = 0;
let errorCount = 0;
console.log(`Starting bulk scraping for ${urls.length} URLs`);
console.log(`Environment NEXT_PUBLIC_API_URL: ${process.env.NEXT_PUBLIC_API_URL}`);
// For server-side API calls in Docker, use direct backend container URL
// Client-side calls use NEXT_PUBLIC_API_URL through nginx, but server-side needs direct container access
const serverSideApiBaseUrl = 'http://backend:8080/api';
console.log(`DEBUG: serverSideApiBaseUrl variable is: ${serverSideApiBaseUrl}`);
// Quick test to verify backend connectivity
try {
console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
console.log(`Backend test response status: ${testResponse.status}`);
} catch (error) {
console.error(`Backend connectivity test failed:`, error);
}
for (const url of urls) {
console.log(`Processing URL: ${url}`);
try {
// Validate URL format
if (!url || typeof url !== 'string' || url.trim() === '') {
results.push({
url: url || 'Empty URL',
status: 'error',
error: 'Invalid URL format'
});
errorCount++;
continue;
}
const trimmedUrl = url.trim();
// Scrape the story
const scrapedStory = await scraper.scrapeStory(trimmedUrl);
// Validate required fields
if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
const missingFields = [];
if (!scrapedStory.title) missingFields.push('title');
if (!scrapedStory.author) missingFields.push('author');
if (!scrapedStory.content) missingFields.push('content');
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Missing required fields: ${missingFields.join(', ')}`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
// Check for duplicates using query parameters
try {
// Use hardcoded backend URL for container-to-container communication
const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
console.log(`Duplicate check URL: ${duplicateCheckUrl}`);
const params = new URLSearchParams({
title: scrapedStory.title,
authorName: scrapedStory.author
});
const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (duplicateCheckResponse.ok) {
const duplicateResult = await duplicateCheckResponse.json();
if (duplicateResult.hasDuplicates) {
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Duplicate story found (${duplicateResult.count} existing)`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
}
} catch (error) {
console.warn('Duplicate check failed:', error);
// Continue with import if duplicate check fails
}
// Create the story
try {
const storyData = {
title: scrapedStory.title,
summary: scrapedStory.summary || undefined,
contentHtml: scrapedStory.content,
sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
authorName: scrapedStory.author,
tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
};
// Use hardcoded backend URL for container-to-container communication
const createUrl = `http://backend:8080/api/stories`;
console.log(`Create story URL: ${createUrl}`);
const createResponse = await fetch(createUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
body: JSON.stringify(storyData),
});
if (!createResponse.ok) {
const errorData = await createResponse.json();
throw new Error(errorData.message || 'Failed to create story');
}
const createdStory = await createResponse.json();
results.push({
url: trimmedUrl,
status: 'imported',
title: scrapedStory.title,
author: scrapedStory.author,
storyId: createdStory.id
});
importedCount++;
console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);
} catch (error) {
console.error(`Failed to create story for ${trimmedUrl}:`, error);
let errorMessage = 'Failed to create story';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: trimmedUrl,
status: 'error',
error: errorMessage,
title: scrapedStory.title,
author: scrapedStory.author
});
errorCount++;
}
} catch (error) {
console.error(`Error processing URL ${url}:`, error);
let errorMessage = 'Unknown error';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: url,
status: 'error',
error: errorMessage
});
errorCount++;
}
}
const response: BulkImportResponse = {
results,
summary: {
total: urls.length,
imported: importedCount,
skipped: skippedCount,
errors: errorCount
}
};
console.log(`Bulk import completed:`, response.summary);
// Trigger Typesense reindex if any stories were imported
if (importedCount > 0) {
try {
console.log('Triggering Typesense reindex after bulk import...');
const reindexUrl = `http://backend:8080/api/stories/reindex-typesense`;
const reindexResponse = await fetch(reindexUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (reindexResponse.ok) {
const reindexResult = await reindexResponse.json();
console.log('Typesense reindex completed:', reindexResult);
} else {
console.warn('Typesense reindex failed:', reindexResponse.status);
}
} catch (error) {
console.warn('Failed to trigger Typesense reindex:', error);
// Don't fail the whole request if reindex fails
}
}
return NextResponse.json(response);
} catch (error) {
console.error('Bulk import error:', error);
if (error instanceof Error) {
return NextResponse.json(
{ error: `Bulk import failed: ${error.message}` },
{ status: 500 }
);
}
return NextResponse.json(
{ error: 'Bulk import failed due to an unknown error' },
{ status: 500 }
);
}
}

View File

@@ -0,0 +1,85 @@
import { NextRequest, NextResponse } from 'next/server';
export async function POST(request: NextRequest) {
try {
const body = await request.json();
const { url } = body;
if (!url || typeof url !== 'string') {
return NextResponse.json(
{ error: 'URL is required and must be a string' },
{ status: 400 }
);
}
// Dynamic import to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const { ScraperError } = await import('@/lib/scraper/types');
const scraper = new StoryScraper();
const story = await scraper.scrapeStory(url);
// Debug logging
console.log('Scraped story data:', {
url: url,
title: story.title,
author: story.author,
summary: story.summary,
contentLength: story.content?.length || 0,
contentPreview: story.content?.substring(0, 200) + '...',
tags: story.tags,
coverImage: story.coverImage
});
return NextResponse.json(story);
} catch (error) {
console.error('Story scraping error:', error);
// Check if it's a ScraperError without importing at module level
if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
return NextResponse.json(
{
error: (error as any).message,
url: (error as any).url
},
{ status: 400 }
);
}
if (error instanceof Error) {
// Handle specific error types
if (error.message.includes('Invalid URL')) {
return NextResponse.json(
{ error: 'Invalid URL provided' },
{ status: 400 }
);
}
if (error.message.includes('Unsupported site')) {
return NextResponse.json(
{ error: 'This website is not supported for scraping' },
{ status: 400 }
);
}
if (error.message.includes('HTTP 404')) {
return NextResponse.json(
{ error: 'Story not found at the provided URL' },
{ status: 404 }
);
}
if (error.message.includes('timeout')) {
return NextResponse.json(
{ error: 'Request timed out while fetching content' },
{ status: 408 }
);
}
}
return NextResponse.json(
{ error: 'Failed to scrape story. Please try again.' },
{ status: 500 }
);
}
}

View File

@@ -0,0 +1,300 @@
'use client';
import { useState } from 'react';
import { useRouter } from 'next/navigation';
import Link from 'next/link';
import { ArrowLeftIcon } from '@heroicons/react/24/outline';
interface ImportResult {
url: string;
status: 'imported' | 'skipped' | 'error';
reason?: string;
title?: string;
author?: string;
error?: string;
storyId?: string;
}
interface BulkImportResponse {
results: ImportResult[];
summary: {
total: number;
imported: number;
skipped: number;
errors: number;
};
}
export default function BulkImportPage() {
const router = useRouter();
const [urls, setUrls] = useState('');
const [isLoading, setIsLoading] = useState(false);
const [results, setResults] = useState<BulkImportResponse | null>(null);
const [error, setError] = useState<string | null>(null);
const handleSubmit = async (e: React.FormEvent) => {
e.preventDefault();
if (!urls.trim()) {
setError('Please enter at least one URL');
return;
}
setIsLoading(true);
setError(null);
setResults(null);
try {
// Parse URLs from textarea (one per line)
const urlList = urls
.split('\n')
.map(url => url.trim())
.filter(url => url.length > 0);
if (urlList.length === 0) {
setError('Please enter at least one valid URL');
setIsLoading(false);
return;
}
if (urlList.length > 50) {
setError('Maximum 50 URLs allowed per bulk import');
setIsLoading(false);
return;
}
// Get auth token for server-side API calls
const token = localStorage.getItem('auth-token');
const response = await fetch('/scrape/bulk', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': token ? `Bearer ${token}` : '',
},
body: JSON.stringify({ urls: urlList }),
});
if (!response.ok) {
const errorData = await response.json();
throw new Error(errorData.error || 'Bulk import failed');
}
const data: BulkImportResponse = await response.json();
setResults(data);
} catch (err) {
console.error('Bulk import error:', err);
setError(err instanceof Error ? err.message : 'Failed to import stories');
} finally {
setIsLoading(false);
}
};
const handleReset = () => {
setUrls('');
setResults(null);
setError(null);
};
const getStatusColor = (status: string) => {
switch (status) {
case 'imported': return 'text-green-700 bg-green-50 border-green-200';
case 'skipped': return 'text-yellow-700 bg-yellow-50 border-yellow-200';
case 'error': return 'text-red-700 bg-red-50 border-red-200';
default: return 'text-gray-700 bg-gray-50 border-gray-200';
}
};
const getStatusIcon = (status: string) => {
switch (status) {
case 'imported': return '✓';
case 'skipped': return '⚠';
case 'error': return '✗';
default: return '';
}
};
return (
<div className="container mx-auto px-4 py-6">
<div className="max-w-4xl mx-auto">
{/* Header */}
<div className="mb-6">
<div className="flex items-center gap-4 mb-4">
<Link
href="/library"
className="inline-flex items-center text-blue-600 hover:text-blue-800"
>
<ArrowLeftIcon className="h-4 w-4 mr-1" />
Back to Library
</Link>
</div>
<h1 className="text-3xl font-bold text-gray-900 mb-2">Bulk Import Stories</h1>
<p className="text-gray-600">
Import multiple stories at once by providing a list of URLs. Each URL will be scraped
and automatically added to your story collection.
</p>
</div>
{!results ? (
// Import Form
<form onSubmit={handleSubmit} className="space-y-6">
<div>
<label htmlFor="urls" className="block text-sm font-medium text-gray-700 mb-2">
Story URLs
</label>
<p className="text-sm text-gray-500 mb-3">
Enter one URL per line. Maximum 50 URLs per import.
</p>
<textarea
id="urls"
value={urls}
onChange={(e) => setUrls(e.target.value)}
placeholder="https://example.com/story1&#10;https://example.com/story2&#10;https://example.com/story3"
className="w-full h-64 px-3 py-2 border border-gray-300 rounded-md shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent"
disabled={isLoading}
/>
<p className="mt-2 text-sm text-gray-500">
URLs: {urls.split('\n').filter(url => url.trim().length > 0).length}
</p>
</div>
{error && (
<div className="bg-red-50 border border-red-200 rounded-md p-4">
<div className="flex">
<div className="ml-3">
<h3 className="text-sm font-medium text-red-800">Error</h3>
<div className="mt-2 text-sm text-red-700">
{error}
</div>
</div>
</div>
</div>
)}
<div className="flex gap-4">
<button
type="submit"
disabled={isLoading || !urls.trim()}
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
>
{isLoading ? 'Importing...' : 'Start Import'}
</button>
<button
type="button"
onClick={handleReset}
disabled={isLoading}
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
>
Clear
</button>
</div>
{isLoading && (
<div className="bg-blue-50 border border-blue-200 rounded-md p-4">
<div className="flex items-center">
<div className="animate-spin rounded-full h-5 w-5 border-b-2 border-blue-600 mr-3"></div>
<div>
<p className="text-sm font-medium text-blue-800">Processing URLs...</p>
<p className="text-sm text-blue-600">
This may take a few minutes depending on the number of URLs and response times of the source websites.
</p>
</div>
</div>
</div>
)}
</form>
) : (
// Results
<div className="space-y-6">
{/* Summary */}
<div className="bg-white border border-gray-200 rounded-lg p-6">
<h2 className="text-xl font-semibold text-gray-900 mb-4">Import Summary</h2>
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
<div className="text-center">
<div className="text-2xl font-bold text-gray-900">{results.summary.total}</div>
<div className="text-sm text-gray-600">Total URLs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-600">{results.summary.imported}</div>
<div className="text-sm text-gray-600">Imported</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-yellow-600">{results.summary.skipped}</div>
<div className="text-sm text-gray-600">Skipped</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-red-600">{results.summary.errors}</div>
<div className="text-sm text-gray-600">Errors</div>
</div>
</div>
</div>
{/* Detailed Results */}
<div className="bg-white border border-gray-200 rounded-lg">
<div className="px-6 py-4 border-b border-gray-200">
<h3 className="text-lg font-medium text-gray-900">Detailed Results</h3>
</div>
<div className="divide-y divide-gray-200">
{results.results.map((result, index) => (
<div key={index} className="p-6">
<div className="flex items-start justify-between">
<div className="flex-1 min-w-0">
<div className="flex items-center gap-2 mb-2">
<span className={`inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium border ${getStatusColor(result.status)}`}>
{getStatusIcon(result.status)} {result.status.charAt(0).toUpperCase() + result.status.slice(1)}
</span>
</div>
<p className="text-sm text-gray-900 font-medium truncate mb-1">
{result.url}
</p>
{result.title && result.author && (
<p className="text-sm text-gray-600 mb-1">
"{result.title}" by {result.author}
</p>
)}
{result.reason && (
<p className="text-sm text-gray-500">
{result.reason}
</p>
)}
{result.error && (
<p className="text-sm text-red-600">
Error: {result.error}
</p>
)}
</div>
</div>
</div>
))}
</div>
</div>
{/* Actions */}
<div className="flex gap-4">
<button
onClick={handleReset}
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2"
>
Import More URLs
</button>
<Link
href="/stories"
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2"
>
View Stories
</Link>
</div>
</div>
)}
</div>
</div>
);
}

View File

@@ -7,6 +7,7 @@ import { useRouter } from 'next/navigation';
import { useAuth } from '../../contexts/AuthContext';
import { useTheme } from '../../lib/theme';
import Button from '../ui/Button';
import Dropdown from '../ui/Dropdown';
export default function Header() {
const [isMenuOpen, setIsMenuOpen] = useState(false);
@@ -14,6 +15,24 @@ export default function Header() {
const { theme, toggleTheme } = useTheme();
const router = useRouter();
const addStoryItems = [
{
href: '/add-story',
label: 'Manual Entry',
description: 'Add a story by manually entering details'
},
{
href: '/stories/import',
label: 'Import from URL',
description: 'Import a single story from a website'
},
{
href: '/stories/import/bulk',
label: 'Bulk Import',
description: 'Import multiple stories from a list of URLs'
}
];
const handleLogout = () => {
logout();
router.push('/login');
@@ -57,12 +76,10 @@ export default function Header() {
>
Authors
</Link>
<Link
href="/add-story"
className="theme-text hover:theme-accent transition-colors font-medium"
>
Add Story
</Link>
<Dropdown
trigger="Add Story"
items={addStoryItems}
/>
</nav>
{/* Right side actions */}
@@ -131,13 +148,32 @@ export default function Header() {
>
Authors
</Link>
<Link
href="/add-story"
className="theme-text hover:theme-accent transition-colors font-medium px-2 py-1"
onClick={() => setIsMenuOpen(false)}
>
Add Story
</Link>
<div className="px-2 py-1">
<div className="font-medium theme-text mb-1">Add Story</div>
<div className="pl-4 space-y-1">
<Link
href="/add-story"
className="block theme-text hover:theme-accent transition-colors text-sm py-1"
onClick={() => setIsMenuOpen(false)}
>
Manual Entry
</Link>
<Link
href="/stories/import"
className="block theme-text hover:theme-accent transition-colors text-sm py-1"
onClick={() => setIsMenuOpen(false)}
>
Import from URL
</Link>
<Link
href="/stories/import/bulk"
className="block theme-text hover:theme-accent transition-colors text-sm py-1"
onClick={() => setIsMenuOpen(false)}
>
Bulk Import
</Link>
</div>
</div>
<Link
href="/settings"
className="theme-text hover:theme-accent transition-colors font-medium px-2 py-1"

View File

@@ -0,0 +1,98 @@
'use client';
import { useState, useRef, useEffect } from 'react';
import Link from 'next/link';
import { ChevronDownIcon } from '@heroicons/react/24/outline';
interface DropdownItem {
href: string;
label: string;
description?: string;
}
interface DropdownProps {
trigger: string;
items: DropdownItem[];
className?: string;
onItemClick?: () => void;
}
export default function Dropdown({ trigger, items, className = '', onItemClick }: DropdownProps) {
const [isOpen, setIsOpen] = useState(false);
const dropdownRef = useRef<HTMLDivElement>(null);
const timeoutRef = useRef<NodeJS.Timeout>();
useEffect(() => {
const handleClickOutside = (event: MouseEvent) => {
if (dropdownRef.current && !dropdownRef.current.contains(event.target as Node)) {
setIsOpen(false);
}
};
if (isOpen) {
document.addEventListener('mousedown', handleClickOutside);
}
return () => {
document.removeEventListener('mousedown', handleClickOutside);
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
}
};
}, [isOpen]);
const handleMouseEnter = () => {
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
}
setIsOpen(true);
};
const handleMouseLeave = () => {
timeoutRef.current = setTimeout(() => {
setIsOpen(false);
}, 150);
};
const handleItemClick = () => {
setIsOpen(false);
onItemClick?.();
};
return (
<div
className={`relative ${className}`}
ref={dropdownRef}
onMouseEnter={handleMouseEnter}
onMouseLeave={handleMouseLeave}
>
<button
onClick={() => setIsOpen(!isOpen)}
className="theme-text hover:theme-accent transition-colors font-medium flex items-center gap-1"
>
{trigger}
<ChevronDownIcon
className={`h-4 w-4 transition-transform duration-200 ${isOpen ? 'rotate-180' : ''}`}
/>
</button>
{isOpen && (
<div className="absolute top-full left-0 mt-1 w-64 theme-card theme-shadow border theme-border rounded-lg py-2 z-50">
{items.map((item, index) => (
<Link
key={index}
href={item.href}
onClick={handleItemClick}
className="block px-4 py-2 theme-text hover:theme-accent transition-colors"
>
<div className="font-medium">{item.label}</div>
{item.description && (
<div className="text-sm theme-text-secondary mt-1">{item.description}</div>
)}
</Link>
))}
</div>
)}
</div>
);
}

View File

@@ -0,0 +1,334 @@
{
"sites": {
"deviantart.com": {
"story": {
"title": "h1",
"titleFallback": "meta[property='og:title']",
"titleFallbackAttribute": "content",
"author": {
"strategy": "text-pattern",
"pattern": "by ([^\\s]+) on DeviantArt",
"searchAfter": "<title>",
"searchBefore": "</title>"
},
"content": {
"strategy": "text-blocks",
"minLength": 200,
"containerHints": ["journal", "literature", "story", "text", "content"],
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]
},
"summary": "meta[property='og:description']",
"summaryAttribute": "content",
"tags": "a[data-tagname]",
"tagsAttribute": "data-tagname",
"coverImage": "meta[property='og:image']",
"coverImageAttribute": "content"
},
"authorPage": {
"storyLinks": "a[data-hook='deviation_link']",
"filterStrategy": "dom-check",
"requiresChildElement": "div[class*='journal']"
}
},
"literotica.com": {
"story": {
"title": "h1",
"titleFallback": "meta[property='og:title']",
"titleFallbackAttribute": "content",
"author": {
"strategy": "link-with-path",
"pathContains": "/authors/",
"searchWithin": "header, .story-info, #story-meta"
},
"content": {
"strategy": "text-blocks",
"minLength": 500,
"containerHints": ["story", "content", "text"],
"excludeSelectors": ["script", "style", "nav", "header", "footer"]
},
"summary": "meta[name='description']",
"summaryAttribute": "content",
"multiPage": {
"enabled": true,
"strategy": "url-pattern",
"pageParam": "page",
"maxPages": 20
}
},
"authorPage": {
"storyLinks": {
"strategy": "href-pattern",
"pattern": "/s/[^/]+$",
"searchWithin": "main, #content, .stories-list"
}
}
},
"mcstories.com": {
"story": {
"title": "title",
"titleTransform": "remove-suffix: - MCStories.com",
"author": "meta[name='dcterms.creator']",
"authorAttribute": "content",
"content": "article#mcstories",
"summary": "meta[name='dcterms.description']",
"summaryAttribute": "content"
},
"authorPage": {
"storyLinks": "a[href$='.html']:not([href*='Authors'])",
"linkPrefix": "https://mcstories.com/"
}
},
"docs-lab.com": {
"story": {
"title": "title",
"titleTransform": "remove-suffix: - Doc's Lab",
"author": "a[href*='/profiles/'] strong",
"content": {
"strategy": "html-between",
"startMarker": "<h2>Story</h2>",
"endMarker": "</div>",
"includeStart": false
},
"tags": "span.label"
},
"authorPage": {
"storyLinks": "a[href*='/submissions/']",
"linkPrefix": "https://docs-lab.com"
}
},
"archiveofourown.org": {
"story": {
"title": "h2.title",
"author": "a[rel='author']",
"content": {
"strategy": "chapters",
"chapterSelector": "div.userstuff[role='article']",
"chaptersWrapper": "#chapters",
"singleChapter": "#workskin"
},
"summary": "div.summary blockquote.userstuff",
"tags": {
"strategy": "multiple-types",
"selectors": {
"fandom": "dd.fandom a.tag",
"warning": "dd.warning a.tag",
"category": "dd.category a.tag",
"relationship": "dd.relationship a.tag",
"character": "dd.character a.tag",
"freeform": "dd.freeform a.tag"
}
},
"multiPage": {
"enabled": true,
"strategy": "chapter-navigation",
"chapterListSelector": "#chapter_index option",
"urlPattern": "/chapters/{chapterId}"
}
},
"authorPage": {
"storyLinks": "h4.heading a[href*='/works/']",
"pagination": {
"enabled": true,
"nextPageSelector": "li.next a[rel='next']"
}
}
},
"fanfiction.net": {
"story": {
"title": "#profile_top b.xcontrast_txt",
"author": "#profile_top a[href*='/u/']",
"content": "#storytext",
"summary": "#profile_top div.xcontrast_txt",
"coverImage": {
"strategy": "lazy-loaded",
"selector": "img.cimage",
"attribute": "data-original"
},
"multiPage": {
"enabled": true,
"strategy": "chapter-dropdown",
"chapterSelector": "select#chap_select option",
"urlPattern": "{baseUrl}/{chapterNumber}"
}
},
"authorPage": {
"storyLinks": "div.z-list a.stitle",
"metadata": {
"strategy": "sibling-text",
"metadataSelector": "div.z-padtop2",
"parsePattern": "Rated: ([^-]+) - .+ - Chapters: (\\d+)"
}
}
},
"royalroad.com": {
"story": {
"title": "h1[property='name']",
"author": "h4[property='author'] a",
"content": {
"strategy": "chapter-content",
"selector": "div.chapter-content",
"cleanupSelectors": [".portlet", ".ads-holder", "div[style*='display:none']"]
},
"summary": "div.description div.hidden-content",
"tags": "span.tags a.fiction-tag",
"coverImage": "img.thumbnail",
"coverImageAttribute": "src",
"multiPage": {
"enabled": true,
"strategy": "table-of-contents",
"tocSelector": "table#chapters tbody tr a[href*='/chapter/']",
"requiresAuth": false
}
},
"authorPage": {
"storyLinks": "div.fiction-list-item h2.fiction-title a",
"additionalInfo": {
"strategy": "data-attributes",
"statsSelector": "div.stats",
"extractStats": ["pages", "followers", "views"]
}
}
},
"wattpad.com": {
"story": {
"title": "h1",
"author": {
"strategy": "schema-org",
"schemaType": "Person",
"property": "name",
"fallbackSelector": "a[href*='/user/']"
},
"content": {
"strategy": "react-content",
"contentClass": "pre-wrap",
"paragraphSelector": "p[data-p-id]",
"requiresJavaScript": true
},
"summary": "h2.description",
"tags": "div.tag-items a.tag",
"coverImage": {
"strategy": "responsive-image",
"selector": "img[alt*='cover']",
"srcsetAttribute": "srcset",
"selectLargest": true
},
"multiPage": {
"enabled": true,
"strategy": "api-based",
"apiPattern": "/v4/parts/{partId}/text",
"tocApiPattern": "/v5/stories/{storyId}/parts",
"requiresAuth": true
}
},
"authorPage": {
"storyLinks": {
"strategy": "infinite-scroll",
"initialSelector": "a[href*='/story/']",
"apiEndpoint": "/v4/users/{userId}/stories",
"requiresJavaScript": true
}
}
}
},
"strategies": {
"text-blocks": {
"description": "Find content by looking for large text blocks",
"implementation": "Find all text nodes, group by parent, select parent with most text"
},
"link-with-path": {
"description": "Find links containing specific path patterns",
"implementation": "querySelector with href*= or iterate and check .href property"
},
"href-pattern": {
"description": "Match links by regex pattern",
"implementation": "Array.from(links).filter(a => pattern.test(a.href))"
},
"text-pattern": {
"description": "Extract text using regex from raw HTML",
"implementation": "Use regex on .html() with proper groups"
},
"html-between": {
"description": "Extract HTML between markers",
"implementation": "indexOf() to find positions, substring to extract"
},
"chapters": {
"description": "Extract story content that may be in chapters",
"implementation": "Check for multiple chapters or single chapter format"
},
"multiple-types": {
"description": "Extract different categories of tags",
"implementation": "Map over selector types and extract each category"
},
"chapter-navigation": {
"description": "Navigate through chapters using chapter index",
"implementation": "Extract chapter IDs and construct URLs"
},
"lazy-loaded": {
"description": "Extract images that are lazy-loaded",
"implementation": "Check data-* attributes for actual image source"
},
"chapter-dropdown": {
"description": "Handle stories with chapter selection dropdown",
"implementation": "Parse dropdown options and construct chapter URLs"
},
"table-of-contents": {
"description": "Extract chapters from a table of contents",
"implementation": "Find all chapter links in TOC structure"
},
"schema-org": {
"description": "Extract data from schema.org structured data",
"implementation": "Parse JSON-LD or microdata for specific properties"
},
"react-content": {
"description": "Extract content from React-rendered pages",
"implementation": "May require JavaScript execution or API access"
},
"responsive-image": {
"description": "Select best quality from responsive images",
"implementation": "Parse srcset and select highest resolution"
},
"api-based": {
"description": "Use API endpoints instead of HTML scraping",
"implementation": "Detect API patterns and make direct API calls"
},
"infinite-scroll": {
"description": "Handle pages with infinite scroll",
"implementation": "Detect scroll API endpoints or pagination"
}
},
"globalOptions": {
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"timeout": 30000,
"retryAttempts": 3,
"rateLimitMs": 1000,
"cacheDuration": 300000,
"javascriptTimeout": 10000
},
"siteNotes": {
"wattpad.com": {
"warning": "Wattpad has aggressive anti-scraping measures. Consider using their API if available.",
"requiresAuth": "Some stories may require login to access full content"
},
"royalroad.com": {
"note": "Very scraper-friendly with good HTML structure"
},
"archiveofourown.org": {
"note": "Respects robots.txt, has good semantic HTML",
"rateLimit": "Be extra respectful of rate limits"
},
"fanfiction.net": {
"note": "Older site with simpler HTML structure",
"warning": "Known to block IPs for aggressive scraping"
}
}
}

View File

@@ -0,0 +1,379 @@
import 'server-only';
// Note: cheerio import is done dynamically to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
SitesConfig,
SiteConfig,
ScrapedStory,
ScrapedAuthorStory,
SelectorStrategy,
MultiPageConfig,
ScraperError
} from './types';
import { RateLimiter } from './utils/rateLimit';
import { ScraperCache } from './utils/cache';
import { UrlParser } from './utils/urlParser';
import {
extractByTextPattern,
extractTextBlocks,
extractHtmlBetween,
extractLinkText,
extractLinkWithPath,
extractHrefPattern,
extractFirstImage,
extractResponsiveImage,
extractLazyLoadedImage,
extractChapters,
extractChapterContent,
extractMultipleTypes,
extractSchemaOrg,
extractReactContent,
cleanHtml,
extractAttribute
} from './strategies';
import sitesConfig from './config/sites.json';
/**
 * Config-driven story scraper.
 *
 * All per-site knowledge (CSS selectors, named extraction strategies,
 * pagination rules) lives in config/sites.json; this class only orchestrates
 * fetching, caching, rate limiting and strategy dispatch. Cheerio is imported
 * dynamically so this server-only module never ends up in a client bundle.
 */
export class StoryScraper {
  private config: SitesConfig;       // parsed config/sites.json
  private cache: ScraperCache;       // URL -> raw HTML cache
  private rateLimiter: RateLimiter;  // enforces min delay between fetches
  constructor() {
    this.config = sitesConfig as SitesConfig;
    this.cache = new ScraperCache(this.config.globalOptions.cacheDuration);
    this.rateLimiter = new RateLimiter(this.config.globalOptions.rateLimitMs);
  }
  /**
   * Scrape one story URL into a ScrapedStory.
   *
   * Looks up the site config by domain, fetches (cache + rate limit), extracts
   * title/author/content (with fallbacks), then optional summary/cover/tags,
   * applies the title transform and sanitizes the content HTML.
   * @throws ScraperError wrapping any failure, with the offending URL attached.
   */
  async scrapeStory(url: string): Promise<ScrapedStory> {
    try {
      if (!UrlParser.validateUrl(url)) {
        throw new Error(`Invalid URL: ${url}`);
      }
      const domain = UrlParser.getDomain(url);
      const siteConfig = this.config.sites[domain];
      if (!siteConfig) {
        throw new Error(`Unsupported site: ${domain}`);
      }
      const html = await this.fetchWithCache(url);
      const cheerio = await import('cheerio');
      const $ = cheerio.load(html);
      const story: ScrapedStory = {
        title: await this.extractFieldWithFallback($, siteConfig.story, 'title', html),
        author: await this.extractFieldWithFallback($, siteConfig.story, 'author', html),
        content: await this.extractContent($, siteConfig.story, url, html),
        sourceUrl: url
      };
      // Extract optional fields
      if (siteConfig.story.summary) {
        story.summary = await this.extractField($, siteConfig.story.summary, html, siteConfig.story.summaryAttribute);
      }
      if (siteConfig.story.coverImage) {
        story.coverImage = await this.extractField($, siteConfig.story.coverImage, html, siteConfig.story.coverImageAttribute);
      }
      if (siteConfig.story.tags) {
        // Strategies may return an array or a single string; normalize to string[].
        const tagsResult = await this.extractTags($, siteConfig.story.tags, html, siteConfig.story.tagsAttribute);
        if (Array.isArray(tagsResult)) {
          story.tags = tagsResult;
        } else if (typeof tagsResult === 'string' && tagsResult) {
          story.tags = [tagsResult];
        }
      }
      // Apply post-processing
      story.title = this.applyTransforms(story.title, siteConfig.story.titleTransform);
      story.content = await cleanHtml(story.content);
      return story;
    } catch (error) {
      if (error instanceof Error) {
        throw new ScraperError(
          `Failed to scrape ${url}: ${error.message}`,
          url,
          error
        );
      }
      throw error;
    }
  }
  /**
   * Scrape an author/profile page and return one entry per story found.
   *
   * Each linked story is scraped sequentially (rate-limited via scrapeStory);
   * individual story failures are logged and skipped rather than aborting the
   * whole run.
   * @throws ScraperError when the author page itself cannot be processed.
   */
  async scrapeAuthorPage(url: string): Promise<ScrapedAuthorStory[]> {
    try {
      if (!UrlParser.validateUrl(url)) {
        throw new Error(`Invalid URL: ${url}`);
      }
      const domain = UrlParser.getDomain(url);
      const siteConfig = this.config.sites[domain];
      if (!siteConfig || !siteConfig.authorPage) {
        throw new Error(`Author page scraping not supported for: ${domain}`);
      }
      const html = await this.fetchWithCache(url);
      const cheerio = await import('cheerio');
      const $ = cheerio.load(html);
      const storyLinks = await this.extractField($, siteConfig.authorPage.storyLinks, html);
      const stories: ScrapedAuthorStory[] = [];
      // NOTE(review): a simple string selector in extractField returns only the
      // FIRST element's text, not an array of hrefs — only strategy-based
      // storyLinks configs reach this loop. Confirm that is intended.
      if (Array.isArray(storyLinks)) {
        for (const link of storyLinks) {
          const storyUrl = UrlParser.normalizeUrl(link, url);
          try {
            const scrapedStory = await this.scrapeStory(storyUrl);
            stories.push({
              url: storyUrl,
              title: scrapedStory.title,
              author: scrapedStory.author,
              summary: scrapedStory.summary
            });
          } catch (error) {
            // Best-effort: skip stories that fail, keep the rest.
            console.warn(`Failed to scrape story ${storyUrl}:`, error);
          }
        }
      }
      return stories;
    } catch (error) {
      if (error instanceof Error) {
        throw new ScraperError(
          `Failed to scrape author page ${url}: ${error.message}`,
          url,
          error
        );
      }
      throw error;
    }
  }
  /**
   * Extract a field using its primary selector, then `<field>Fallback` /
   * `<field>FallbackAttribute` from the site config when the primary result
   * is empty. Always returns a trimmed string ('' when nothing matched).
   */
  private async extractFieldWithFallback(
    $: any,
    config: any,
    fieldName: string,
    html: string
  ): Promise<string> {
    const primarySelector = config[fieldName];
    const fallbackSelector = config[`${fieldName}Fallback`];
    const attribute = config[`${fieldName}Attribute`];
    const fallbackAttribute = config[`${fieldName}FallbackAttribute`];
    // Try primary selector first
    if (primarySelector) {
      const result = await this.extractField($, primarySelector, html, attribute);
      if (result && result.trim()) {
        return result.trim();
      }
    }
    // Try fallback selector if primary failed
    if (fallbackSelector) {
      const result = await this.extractField($, fallbackSelector, html, fallbackAttribute);
      if (result && result.trim()) {
        return result.trim();
      }
    }
    return '';
  }
  /**
   * Extract a single value: plain string selectors read the FIRST matching
   * element (text, or `attribute` when given); SelectorStrategy objects are
   * dispatched to executeStrategy and may return any shape.
   */
  private async extractField(
    $: any,
    selector: string | SelectorStrategy,
    html: string,
    attribute?: string
  ): Promise<any> {
    if (typeof selector === 'string') {
      // Simple CSS selector - always return single value (first element)
      const element = $(selector).first();
      if (attribute) {
        // Extract specific attribute instead of text
        return element.attr(attribute) || '';
      }
      return element.text().trim();
    }
    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }
  /**
   * Like extractField, but a plain string selector collects ALL matching
   * elements (tags are naturally multi-valued). Returns string[] for string
   * selectors; strategies return their own shape.
   */
  private async extractTags(
    $: any,
    selector: string | SelectorStrategy,
    html: string,
    attribute?: string
  ): Promise<any> {
    if (typeof selector === 'string') {
      // Simple CSS selector - collect ALL matching elements for tags
      const elements = $(selector);
      if (elements.length === 0) {
        return [];
      }
      const results: string[] = [];
      elements.each((_: any, elem: any) => {
        const $elem = $(elem);
        const value = attribute ? $elem.attr(attribute) : $elem.text().trim();
        if (value) {
          results.push(value);
        }
      });
      return results;
    }
    // Strategy-based extraction for tags
    return await this.executeStrategy($, selector, html);
  }
  /**
   * Dispatch a SelectorStrategy to its implementation in ./strategies.
   * @throws Error for a strategy name with no implementation here (note:
   * sites.json also mentions e.g. 'data-attributes' / 'sibling-text' /
   * 'infinite-scroll', which are NOT handled by this switch).
   */
  private async executeStrategy(
    $: any,
    strategy: SelectorStrategy,
    html: string
  ): Promise<any> {
    switch (strategy.strategy) {
      case 'text-pattern':
        return extractByTextPattern(html, strategy as any);
      case 'link-with-path':
        return extractLinkWithPath($, strategy as any);
      case 'text-blocks':
        return extractTextBlocks($, strategy as any);
      case 'href-pattern':
        return extractHrefPattern($, strategy as any);
      case 'html-between':
        return extractHtmlBetween(html, strategy as any);
      case 'link-text':
        return extractLinkText($, strategy as any);
      case 'first-image':
        return extractFirstImage($, strategy as any);
      case 'responsive-image':
        return extractResponsiveImage($, strategy as any);
      case 'lazy-loaded':
        return extractLazyLoadedImage($, strategy as any);
      case 'chapters':
        return extractChapters($, strategy as any);
      case 'chapter-content':
        return extractChapterContent($, strategy as any);
      case 'multiple-types':
        return extractMultipleTypes($, strategy as any);
      case 'schema-org':
        return extractSchemaOrg($, strategy as any);
      case 'react-content':
        return extractReactContent($, strategy as any);
      default:
        throw new Error(`Unknown strategy: ${strategy.strategy}`);
    }
  }
  /**
   * Extract story content from the first page, then — when multiPage is
   * enabled — append content from each additional page, separated by blank
   * lines.
   */
  private async extractContent(
    $: any,
    storyConfig: any,
    url: string,
    html: string
  ): Promise<string> {
    let content = await this.extractField($, storyConfig.content, html);
    if (storyConfig.multiPage?.enabled) {
      const additionalPages = await this.fetchAdditionalPages(
        $,
        url,
        storyConfig.multiPage
      );
      for (const pageHtml of additionalPages) {
        const cheerioPage = await import('cheerio');
        const $page = cheerioPage.load(pageHtml);
        const pageContent = await this.extractField(
          $page,
          storyConfig.content,
          pageHtml
        );
        content += '\n\n' + pageContent;
      }
    }
    return content;
  }
  /**
   * Fetch the raw HTML of pages 2..maxPages (default cap 20) of a multi-page
   * story. 'url-pattern' builds each URL from the base; otherwise a
   * nextPageSelector link is followed.
   *
   * NOTE(review): `config.nextPageSelector` is evaluated against the FIRST
   * page's `$` on every iteration, so the same "next" URL would be fetched
   * repeatedly until maxPages — the selector should probably be re-run
   * against each newly fetched page. Confirm intended behavior.
   */
  private async fetchAdditionalPages(
    $: any,
    baseUrl: string,
    config: MultiPageConfig
  ): Promise<string[]> {
    const pages: string[] = [];
    let currentUrl = baseUrl;
    let pageNum = 2;
    while (pageNum <= (config.maxPages || 20)) {
      let nextUrl: string | null = null;
      if (config.strategy === 'url-pattern') {
        nextUrl = UrlParser.buildPageUrl(baseUrl, pageNum, config);
      } else if (config.nextPageSelector) {
        const nextLink = $(config.nextPageSelector).attr('href');
        if (nextLink) {
          nextUrl = UrlParser.normalizeUrl(nextLink, currentUrl);
        }
      }
      if (!nextUrl) break;
      try {
        await this.rateLimiter.throttle();
        const html = await this.fetchWithCache(nextUrl);
        pages.push(html);
        currentUrl = nextUrl;
        pageNum++;
      } catch (error) {
        // Stop at the first page that fails; keep what we have so far.
        console.error(`Failed to fetch page ${pageNum}:`, error);
        break;
      }
    }
    return pages;
  }
  /**
   * Fetch a URL's HTML with caching, rate limiting, the configured
   * User-Agent, and the configured timeout (via AbortSignal).
   * @throws Error on any non-2xx response.
   */
  private async fetchWithCache(url: string): Promise<string> {
    const cached = this.cache.get(url);
    if (cached) {
      return cached;
    }
    await this.rateLimiter.throttle();
    const response = await fetch(url, {
      headers: {
        'User-Agent': this.config.globalOptions.userAgent,
      },
      signal: AbortSignal.timeout(this.config.globalOptions.timeout)
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const html = await response.text();
    this.cache.set(url, html);
    return html;
  }
  /**
   * Apply a configured post-processing transform to extracted text.
   * Currently only "remove-suffix:<text>" (case-insensitive suffix strip).
   *
   * NOTE(review): the suffix is interpolated into a RegExp without escaping,
   * so regex metacharacters in the config value would change the match —
   * confirm config values are plain text.
   */
  private applyTransforms(text: string, transform?: string): string {
    if (!transform) return text;
    if (transform.startsWith('remove-suffix:')) {
      const suffix = transform.substring('remove-suffix:'.length).trim();
      return text.replace(new RegExp(`${suffix}$`, 'i'), '').trim();
    }
    return text;
  }
}

View File

@@ -0,0 +1,164 @@
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
ChaptersStrategy,
ChapterContentStrategy,
MultipleTypesStrategy,
SchemaOrgStrategy,
ReactContentStrategy
} from '../types';
/**
 * Extract story content that may be split over several chapter nodes.
 * Order of preference: multiple chapters inside `chaptersWrapper`, then the
 * `singleChapter` container, then the first node matching `chapterSelector`.
 */
export function extractChapters(
  $: any,
  config: ChaptersStrategy
): string {
  // A wrapper holding more than one chapter body wins.
  if (config.chaptersWrapper) {
    const wrapper = $(config.chaptersWrapper);
    if (wrapper.length > 0) {
      const chapterNodes = wrapper.find(config.chapterSelector);
      if (chapterNodes.length > 1) {
        const parts: string[] = [];
        chapterNodes.each((_: any, node: any) => {
          parts.push($(node).html() + '\n\n');
        });
        return parts.join('').trim();
      }
    }
  }
  // Single-chapter layout.
  if (config.singleChapter) {
    const single = $(config.singleChapter);
    if (single.length > 0) {
      return single.html() || '';
    }
  }
  // Last resort: the first node matching the chapter selector directly.
  return $(config.chapterSelector).first().html() || '';
}
/**
 * Extract a chapter's inner HTML after stripping ad/junk nodes listed in
 * `cleanupSelectors` from the matched container.
 */
export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const container = $(config.selector);
  // Strip ads / hidden junk before serializing.
  for (const junkSelector of config.cleanupSelectors ?? []) {
    container.find(junkSelector).remove();
  }
  return container.html() || '';
}
/**
 * Collect tags of several categories, formatted as "category: tag".
 * Empty tag texts are skipped; category order follows `config.selectors`.
 */
export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const collected: string[] = [];
  for (const [category, selector] of Object.entries(config.selectors)) {
    $(selector).each((_: any, node: any) => {
      const label = $(node).text().trim();
      if (label) {
        collected.push(`${category}: ${label}`);
      }
    });
  }
  return collected;
}
/**
 * Extract a property from schema.org structured data.
 *
 * Scans JSON-LD <script> blocks for an item of `config.schemaType` and reads
 * `config.property` from it; falls back to `config.fallbackSelector` text.
 *
 * Bug fix: the original `return item[config.property]` returned from the
 * `.each()` CALLBACK, discarding the value — the function always fell through
 * to the fallback. The result is now captured and iteration stopped.
 */
export function extractSchemaOrg(
  $: any,
  config: SchemaOrgStrategy
): string {
  let found: string | undefined;
  // JSON-LD blocks are the most reliable source of schema.org data.
  $('script[type="application/ld+json"]').each((_: any, elem: any) => {
    try {
      const data = JSON.parse($(elem).html() || '');
      const item = Array.isArray(data)
        ? data.find((entry: any) => entry['@type'] === config.schemaType)
        : (data['@type'] === config.schemaType ? data : undefined);
      if (item && item[config.property]) {
        found = item[config.property];
        return false; // stop iterating once a value is found
      }
    } catch (e) {
      // Invalid JSON in this script tag — skip it and keep scanning.
    }
  });
  if (found) {
    return found;
  }
  // Fallback to a plain CSS selector when no JSON-LD matched.
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }
  return '';
}
/**
 * Extract server-rendered paragraphs from a React page.
 *
 * Simplified: fully client-rendered content would require JavaScript
 * execution or an API call; here we only gather paragraphs carrying
 * `config.contentClass`, joined by blank lines.
 */
export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  const pieces: string[] = [];
  $(config.paragraphSelector).each((_: any, node: any) => {
    const $node = $(node);
    if ($node.hasClass(config.contentClass)) {
      pieces.push($node.html() + '\n\n');
    }
  });
  return pieces.join('').trim();
}
/**
 * Sanitize scraped HTML: strip script/style/embed elements and empty
 * paragraphs/divs, then re-serialize. Cheerio is imported dynamically to
 * keep it out of client bundles.
 */
export async function cleanHtml(html: string): Promise<string> {
  // Basic HTML cleaning - remove scripts, styles, and dangerous elements
  const cheerio = await import('cheerio');
  const $ = cheerio.load(html, {
    // Preserve self-closing tags like <br>
    xmlMode: false,
    decodeEntities: false
  });
  // Remove dangerous elements
  $('script, style, iframe, embed, object').remove();
  // Remove empty paragraphs and divs (but preserve <br> tags)
  $('p:empty, div:empty').not(':has(br)').remove();
  // Clean up excessive whitespace in text nodes only, preserve <br> tags
  // NOTE(review): `$('*')` matches element nodes, so `elem.type === 'text'`
  // never holds and this loop appears to be a no-op — confirm, and either
  // drop it or iterate `$('*').contents()` if text-node trimming is wanted.
  $('*').each((_, elem) => {
    const $elem = $(elem);
    if (elem.type === 'text') {
      const text = $elem.text();
      if (text && text.trim() !== text) {
        $elem.replaceWith(text.trim());
      }
    }
  });
  // Return HTML with proper self-closing tag format
  return $.html() || '';
}
/**
 * Read a single attribute off the first element matching `selector`.
 * Returns '' when the element or the attribute is absent.
 */
export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const match = $(selector).first();
  const value = match.attr(attribute);
  return value || '';
}

View File

@@ -0,0 +1,3 @@
// Barrel file: re-export every extraction strategy so consumers can import
// from './strategies' without knowing how the implementations are split.
export * from './textExtractor';
export * from './linkExtractor';
export * from './contentCleaner';

View File

@@ -0,0 +1,98 @@
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
LinkWithPathStrategy,
HrefPatternStrategy,
FirstImageStrategy,
ResponsiveImageStrategy,
LazyLoadedStrategy
} from '../types';
/**
 * Return the text of the first anchor whose href contains
 * `config.pathContains`, searching inside `config.searchWithin`
 * (or the whole body). '' when no anchor matches.
 */
export function extractLinkWithPath(
  $: any,
  config: LinkWithPathStrategy
): string {
  const scope = config.searchWithin ? $(config.searchWithin) : $('body');
  const anchors = scope.find('a');
  for (let idx = 0; idx < anchors.length; idx++) {
    const anchor = anchors.eq(idx);
    const href = anchor.attr('href');
    if (!href || !href.includes(config.pathContains)) {
      continue;
    }
    return anchor.text().trim();
  }
  return '';
}
/**
 * Collect every anchor href (within `config.searchWithin` or the body)
 * matching the regex `config.pattern`.
 */
export function extractHrefPattern(
  $: any,
  config: HrefPatternStrategy
): string[] {
  const scope = config.searchWithin ? $(config.searchWithin) : $('body');
  const matcher = new RegExp(config.pattern);
  const matched: string[] = [];
  scope.find('a').each((_: any, node: any) => {
    const href = $(node).attr('href');
    if (href && matcher.test(href)) {
      matched.push(href);
    }
  });
  return matched;
}
/**
 * Return `config.attribute` of the first <img> inside `config.searchWithin`
 * (or the body); '' when no image or attribute exists.
 */
export function extractFirstImage(
  $: any,
  config: FirstImageStrategy
): string {
  const scope = config.searchWithin ? $(config.searchWithin) : $('body');
  const firstImage = scope.find('img').first();
  return firstImage.attr(config.attribute) || '';
}
/**
 * Pick the best URL from a responsive image. When `selectLargest` is set and
 * a srcset is present, the entry with the greatest effective width wins
 * (density descriptors like "2x" are scaled by 100 to compare with "Nw"
 * widths); otherwise plain `src` is returned.
 */
export function extractResponsiveImage(
  $: any,
  config: ResponsiveImageStrategy
): string {
  const image = $(config.selector).first();
  if (config.selectLargest && config.srcsetAttribute) {
    const srcset = image.attr(config.srcsetAttribute);
    if (srcset) {
      // Each entry is "url descriptor"; missing descriptor counts as "1x".
      let best: { url: string; width: number } | null = null;
      for (const entry of srcset.split(',')) {
        const pieces = entry.trim().split(' ');
        const url = pieces[0];
        const descriptor = pieces[1] || '1x';
        let width = 100;
        if (descriptor.includes('w')) {
          width = parseInt(descriptor.replace('w', ''));
        } else if (descriptor.includes('x')) {
          width = parseInt(descriptor.replace('x', '')) * 100;
        }
        // ">=" keeps the later entry on ties, matching the original reduce.
        if (!best || width >= best.width) {
          best = { url, width };
        }
      }
      if (best) {
        return best.url;
      }
    }
  }
  return image.attr('src') || '';
}
/**
 * Resolve a lazy-loaded image URL: lazy loaders stash the real source in a
 * data-* attribute (`config.attribute`), so try that first, then `src`.
 */
export function extractLazyLoadedImage(
  $: any,
  config: LazyLoadedStrategy
): string {
  const image = $(config.selector).first();
  const lazySource = image.attr(config.attribute);
  return lazySource || image.attr('src') || '';
}

View File

@@ -0,0 +1,144 @@
import 'server-only';
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
TextPatternStrategy,
TextBlockStrategy,
HtmlBetweenStrategy,
LinkTextStrategy
} from '../types';
/**
 * Extract a single value from raw HTML with a case-insensitive regex.
 *
 * The search window can be narrowed with `searchAfter` / `searchBefore`
 * (plain substring markers). Returns the trimmed capture group
 * (`config.group`, default 1; 0 = whole match), or '' when nothing matches.
 *
 * Bug fix: `match[config.group || 1].trim()` threw a TypeError when the
 * capture group did not participate in the match, and `|| 1` made
 * `group: 0` (whole match) unreachable. Now uses `?? 1` plus optional
 * chaining.
 */
export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  let searchContent = html;
  // Limit search scope if specified
  if (config.searchAfter) {
    const afterIndex = html.indexOf(config.searchAfter);
    if (afterIndex !== -1) {
      searchContent = html.substring(afterIndex);
    }
  }
  if (config.searchBefore) {
    const beforeIndex = searchContent.indexOf(config.searchBefore);
    if (beforeIndex !== -1) {
      searchContent = searchContent.substring(0, beforeIndex);
    }
  }
  const regex = new RegExp(config.pattern, 'i');
  const match = searchContent.match(regex);
  // `?? 1` keeps group 0 selectable; `?.` guards non-participating groups.
  return match?.[config.group ?? 1]?.trim() ?? '';
}
/**
 * Heuristic content finder: pick the element whose OWN text (excluding
 * children) is long enough (`minLength`, default 500) to be story content.
 * Prefers an element whose class/id contains one of `containerHints`;
 * otherwise falls back to the single qualifying block or the largest one.
 *
 * NOTE: elements matching `excludeSelectors` are removed from the live
 * document, so this mutates the passed-in cheerio tree.
 *
 * Bug fix: when no block qualified, `blocks.reduce(...)` (no initial value)
 * threw a TypeError before the `largestBlock ? ... : ''` guard could run.
 * An explicit empty check now returns '' instead.
 */
export function extractTextBlocks(
  $: any,
  config: TextBlockStrategy
): string {
  const blocks: Array<{element: any, text: string}> = [];
  // Remove excluded elements first
  if (config.excludeSelectors) {
    config.excludeSelectors.forEach(selector => {
      $(selector).remove();
    });
  }
  $('*').each((_: any, elem: any) => {
    const $elem = $(elem);
    // Own text only: clone, drop children, read what is left.
    const text = $elem.clone().children().remove().end().text().trim();
    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });
  // Nothing qualified — bail out instead of crashing in reduce() below.
  if (blocks.length === 0) {
    return '';
  }
  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    if (config.containerHints && config.containerHints.length > 0) {
      const hasHints = config.containerHints.some(hint =>
        $(block.element).attr('class')?.includes(hint) ||
        $(block.element).attr('id')?.includes(hint)
      );
      return hasHints;
    }
    return blocks.length === 1;
  });
  if (storyBlock) {
    return $(storyBlock.element).html() || '';
  }
  // Fallback to largest block (blocks is non-empty here, so reduce is safe)
  const largestBlock = blocks.reduce((prev, current) =>
    prev.text.length > current.text.length ? prev : current
  );
  return $(largestBlock.element).html() || '';
}
/**
 * Extract the raw HTML between two substring markers.
 *
 * Returns '' when the start marker is absent. When the end marker is absent,
 * everything after the start marker is returned UNTRIMMED (matching the
 * original behavior); otherwise the delimited slice is trimmed.
 */
export function extractHtmlBetween(
  html: string,
  config: HtmlBetweenStrategy
): string {
  const startIndex = html.indexOf(config.startMarker);
  if (startIndex === -1) {
    return '';
  }
  const from = config.includeStart
    ? startIndex
    : startIndex + config.startMarker.length;
  const endIndex = html.indexOf(config.endMarker, from);
  if (endIndex === -1) {
    return html.substring(from);
  }
  return html.substring(from, endIndex).trim();
}
/**
 * Find the text of a link located "near" any of the given text fragments.
 *
 * For each hint in `config.nearText` (first hit wins) every element in scope
 * is scanned; when an element's text contains the hint we take, in order of
 * preference: a link inside the element, the element itself if it is a link,
 * or a link within the next three siblings. Returns '' when nothing matches.
 */
export function extractLinkText(
  $: any,
  config: LinkTextStrategy
): string {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');
  // Look for links near the specified text patterns
  let foundText = '';
  config.nearText.forEach(text => {
    if (foundText) return; // Already found
    searchScope.find('*').each((_: any, elem: any) => {
      const $elem = $(elem);
      const elemText = $elem.text().toLowerCase();
      if (elemText.includes(text.toLowerCase())) {
        // Look for nearby links
        const $link = $elem.find('a').first();
        if ($link.length) {
          foundText = $link.text().trim();
          return false; // Break out of each
        }
        // Check if the element itself is a link
        if ($elem.is('a')) {
          foundText = $elem.text().trim();
          return false;
        }
        // Look for links in the next few siblings
        // NOTE(review): .first().each(...) visits at most one node and its
        // `return false` only exits that inner callback, so the outer scan
        // continues (possibly overwriting foundText with a later match) —
        // confirm this fallback behaves as intended.
        const $siblings = $elem.nextAll().slice(0, 3);
        $siblings.find('a').first().each((_: any, link: any) => {
          foundText = $(link).text().trim();
          return false;
        });
      }
    });
  });
  return foundText;
}

View File

@@ -0,0 +1,248 @@
/** Scraping configuration for one site: story-page plus author-page rules. */
export interface SiteConfig {
  story: StorySelectors;
  // Declared required, but the scraper still guards on its presence at runtime.
  authorPage: AuthorPageSelectors;
}
/** How to pull each story field out of a page: a CSS selector or a named strategy. */
export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;            // chapter/page traversal rules
  titleFallback?: string;                 // secondary selector when `title` yields nothing
  titleFallbackAttribute?: string;        // attribute read from the fallback match
  titleTransform?: string;                // post-processing, e.g. "remove-suffix:<text>"
  summaryAttribute?: string;              // read this attribute instead of element text
  coverImageAttribute?: string;
  tagsAttribute?: string;
}
/** How to enumerate stories on a site's author/profile page. */
export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: PaginationConfig;
  linkPrefix?: string;                    // base prepended to relative story links
  filterStrategy?: string;
  requiresChildElement?: string;
  requiresNavigation?: NavigationConfig;
  metadata?: MetadataConfig;
  additionalInfo?: AdditionalInfoConfig;
}
/** Open-ended strategy descriptor; concrete strategies below narrow this. */
export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}
/** Multi-page/chapter traversal settings for a story. */
export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link' | 'chapter-navigation' | 'chapter-dropdown' | 'table-of-contents' | 'api-based';
  nextPageSelector?: string;              // anchor pointing at the next page
  pageParam?: string;                     // query parameter carrying the page number
  maxPages?: number;                      // safety cap (scraper defaults to 20)
  chapterListSelector?: string;
  chapterSelector?: string;
  urlPattern?: string;                    // template with {page}/{chapterId}-style placeholders
  tocSelector?: string;
  requiresAuth?: boolean;
  apiPattern?: string;
  tocApiPattern?: string;
}
/** Author-page pagination (follow a "next" link). */
export interface PaginationConfig {
  enabled: boolean;
  nextPageSelector: string;
}
/** Click-through navigation needed before content is reachable (JS-driven pages). */
export interface NavigationConfig {
  enabled: boolean;
  clickText: string;
  waitMs: number;
}
/** Regex-based metadata extraction from a sibling element's text. */
export interface MetadataConfig {
  strategy: string;
  metadataSelector: string;
  parsePattern: string;
}
/** Extra per-story stats pulled from the author listing. */
export interface AdditionalInfoConfig {
  strategy: string;
  statsSelector: string;
  extractStats: string[];
}
/** Result of scraping a single story page. */
export interface ScrapedStory {
  title: string;
  author: string;
  content: string;                        // HTML, sanitized via cleanHtml()
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
/** Lightweight story entry harvested from an author page. */
export interface ScrapedAuthorStory {
  url: string;
  title: string;
  author: string;
  summary?: string;
}
/** Shape of config/sites.json. */
export interface SitesConfig {
  sites: Record<string, SiteConfig>;      // keyed by domain without "www."
  strategies: Record<string, StrategyDescription>;
  globalOptions: GlobalOptions;
  siteNotes?: Record<string, SiteNotes>;
}
/** Human-readable strategy documentation (informational; no visible code path reads it). */
export interface StrategyDescription {
  description: string;
  implementation: string;
}
/** Cross-site scraper settings. */
export interface GlobalOptions {
  userAgent: string;
  timeout: number;                        // per-request timeout in ms
  retryAttempts: number;
  rateLimitMs: number;                    // min delay between requests
  cacheDuration?: number;                 // HTML cache TTL in ms
  javascriptTimeout?: number;
}
/** Free-form operator notes per site (warnings, rate-limit advice). */
export interface SiteNotes {
  warning?: string;
  note?: string;
  rateLimit?: string;
  requiresAuth?: string;
}
// Strategy-specific interfaces — each narrows SelectorStrategy to the options
// its implementation in ./strategies actually reads.
/** Regex over raw HTML, optionally windowed by substring markers. */
export interface TextPatternStrategy extends SelectorStrategy {
  strategy: 'text-pattern';
  pattern: string;
  group?: number;                 // capture group to return (default 1)
  searchAfter?: string;           // only search after this substring
  searchBefore?: string;          // only search before this substring
}
/** Text of the first anchor whose href contains a path fragment. */
export interface LinkWithPathStrategy extends SelectorStrategy {
  strategy: 'link-with-path';
  pathContains: string;
  searchWithin?: string;          // scope selector (default: body)
}
/** Heuristic "largest text block" content finder. */
export interface TextBlockStrategy extends SelectorStrategy {
  strategy: 'text-blocks';
  minLength?: number;             // min own-text length to qualify (default 500)
  containerHints?: string[];      // class/id fragments that mark story containers
  excludeSelectors?: string[];    // removed from the document before scanning
}
/** All hrefs matching a regex. */
export interface HrefPatternStrategy extends SelectorStrategy {
  strategy: 'href-pattern';
  pattern: string;
  searchWithin?: string;
}
/** Raw HTML between two substring markers. */
export interface HtmlBetweenStrategy extends SelectorStrategy {
  strategy: 'html-between';
  startMarker: string;
  endMarker: string;
  includeStart?: boolean;         // keep the start marker in the result
}
/** Multi- or single-chapter story content. */
export interface ChaptersStrategy extends SelectorStrategy {
  strategy: 'chapters';
  chapterSelector: string;
  chaptersWrapper?: string;       // container holding several chapters
  singleChapter?: string;         // fallback container for one-chapter works
}
/** Tags of several categories, one selector per category. */
export interface MultipleTypesStrategy extends SelectorStrategy {
  strategy: 'multiple-types';
  selectors: Record<string, string>;
}
/** Link located near any of the given text fragments. */
export interface LinkTextStrategy extends SelectorStrategy {
  strategy: 'link-text';
  nearText: string[];
  searchWithin?: string;
}
/** Attribute of the first <img> inside a scope. */
export interface FirstImageStrategy extends SelectorStrategy {
  strategy: 'first-image';
  searchWithin: string;
  attribute: string;
}
/** Property from schema.org JSON-LD, with optional CSS fallback. */
export interface SchemaOrgStrategy extends SelectorStrategy {
  strategy: 'schema-org';
  schemaType: string;             // value of "@type" to match
  property: string;               // property to read from the matching item
  fallbackSelector?: string;
}
/** Server-rendered paragraphs of a React page. */
export interface ReactContentStrategy extends SelectorStrategy {
  strategy: 'react-content';
  contentClass: string;           // class marking real content paragraphs
  paragraphSelector: string;
  requiresJavaScript: boolean;
}
/** Best-quality candidate from a srcset. */
export interface ResponsiveImageStrategy extends SelectorStrategy {
  strategy: 'responsive-image';
  selector: string;
  srcsetAttribute: string;
  selectLargest: boolean;
}
/** Real URL of a lazy-loaded image (data-* attribute, then src). */
export interface LazyLoadedStrategy extends SelectorStrategy {
  strategy: 'lazy-loaded';
  selector: string;
  attribute: string;
}
/** Chapter container HTML after removing junk nodes. */
export interface ChapterContentStrategy extends SelectorStrategy {
  strategy: 'chapter-content';
  selector: string;
  cleanupSelectors?: string[];
}
/** Stats read from data attributes on an author listing (config-only; no implementation in view). */
export interface DataAttributesStrategy extends SelectorStrategy {
  strategy: 'data-attributes';
  statsSelector: string;
  extractStats: string[];
}
/** Metadata parsed out of a sibling element's text (config-only; no implementation in view). */
export interface SiblingTextStrategy extends SelectorStrategy {
  strategy: 'sibling-text';
  metadataSelector: string;
  parsePattern: string;
}
/** API-endpoint-driven content retrieval (config-only; no implementation in view). */
export interface ApiBasedStrategy extends SelectorStrategy {
  strategy: 'api-based';
  apiPattern: string;
  tocApiPattern?: string;
  requiresAuth: boolean;
}
/** Infinite-scroll listings (config-only; no implementation in view). */
export interface InfiniteScrollStrategy extends SelectorStrategy {
  strategy: 'infinite-scroll';
  initialSelector: string;
  apiEndpoint: string;
  requiresJavaScript: boolean;
}
/**
 * Error wrapper used by the scraper so callers get both the URL being
 * processed and the underlying cause alongside the message.
 */
export class ScraperError extends Error {
  public url: string;
  public originalError?: Error;

  constructor(message: string, url: string, originalError?: Error) {
    super(message);
    this.name = 'ScraperError';
    this.url = url;
    this.originalError = originalError;
  }
}

View File

@@ -0,0 +1,35 @@
/**
 * Tiny in-memory TTL cache for fetched HTML pages.
 * Expiry is lazy: staleness is checked (and the entry evicted) on read,
 * never by a timer.
 */
export class ScraperCache {
  private store = new Map<string, { data: any; timestamp: number }>();
  private ttl: number;

  /** @param ttlMs entry lifetime in milliseconds (default 5 minutes) */
  constructor(ttlMs: number = 300000) {
    this.ttl = ttlMs;
  }

  /** Cached value, or null when missing or expired (expired entries are evicted). */
  get(key: string): any | null {
    const entry = this.store.get(key);
    if (!entry) {
      return null;
    }
    const age = Date.now() - entry.timestamp;
    if (age > this.ttl) {
      this.store.delete(key);
      return null;
    }
    return entry.data;
  }

  /** Store a value stamped with the current time. */
  set(key: string, data: any): void {
    this.store.set(key, { data, timestamp: Date.now() });
  }

  /** Drop every entry. */
  clear(): void {
    this.store.clear();
  }

  /** Number of entries currently held (may include not-yet-evicted stale ones). */
  size(): number {
    return this.store.size;
  }
}

View File

@@ -0,0 +1,23 @@
/**
 * Enforces a minimum delay between consecutive requests.
 * NOTE(review): not designed for concurrent callers — parallel `throttle()`
 * calls read/write `lastRequest` without coordination; fine for the
 * sequential scraping loop that uses it.
 */
export class RateLimiter {
  private lastRequest = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  /** Resolve once at least `minDelay` ms have passed since the previous call. */
  async throttle(): Promise<void> {
    const elapsed = Date.now() - this.lastRequest;
    const remaining = this.minDelay - elapsed;
    if (remaining > 0) {
      await this.delay(remaining);
    }
    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

View File

@@ -0,0 +1,61 @@
/**
 * Static helpers for URL validation, normalization and pagination URLs.
 */
export class UrlParser {
  /** Hostname without a leading "www."; throws on unparsable input. */
  static getDomain(url: string): string {
    try {
      return new URL(url).hostname.replace(/^www\./, '');
    } catch (error) {
      throw new Error(`Invalid URL: ${url}`);
    }
  }

  /** True only for well-formed http/https URLs. */
  static validateUrl(url: string): boolean {
    try {
      const { protocol } = new URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /** Build the URL of page `pageNum`, via a query param or a {page} pattern. */
  static buildPageUrl(baseUrl: string, pageNum: number, config: any): string {
    try {
      const urlObj = new URL(baseUrl);
      if (config.pageParam) {
        urlObj.searchParams.set(config.pageParam, pageNum.toString());
      } else if (config.urlPattern) {
        // Replace {page} or similar patterns in URL
        return config.urlPattern.replace(/\{page\}/g, pageNum.toString());
      }
      return urlObj.toString();
    } catch (error) {
      throw new Error(`Failed to build page URL: ${error}`);
    }
  }

  /** Resolve a possibly-relative URL against `baseUrl`; absolute URLs pass through. */
  static normalizeUrl(url: string, baseUrl?: string): string {
    try {
      if (/^https?:\/\//.test(url)) {
        return url;
      }
      return baseUrl ? new URL(url, baseUrl).toString() : url;
    } catch (error) {
      throw new Error(`Failed to normalize URL: ${url}`);
    }
  }

  /** Look up the per-site config for a URL's domain; throws when unsupported. */
  static extractDomainConfig(url: string, sitesConfig: any): any {
    const domain = this.getDomain(url);
    const config = sitesConfig.sites[domain];
    if (!config) {
      throw new Error(`Unsupported site: ${domain}`);
    }
    return config;
  }
}