Add story scraping (single-URL and bulk import) and related improvements

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

View File

@@ -12,6 +12,9 @@ import ImageUpload from '../../components/ui/ImageUpload';
import { storyApi, authorApi } from '../../lib/api';
export default function AddStoryPage() {
const [importMode, setImportMode] = useState<'manual' | 'url'>('manual');
const [importUrl, setImportUrl] = useState('');
const [scraping, setScraping] = useState(false);
const [formData, setFormData] = useState({
title: '',
summary: '',
@@ -130,6 +133,57 @@ export default function AddStoryPage() {
setFormData(prev => ({ ...prev, tags }));
};
const handleImportFromUrl = async () => {
if (!importUrl.trim()) {
setErrors({ importUrl: 'URL is required' });
return;
}
setScraping(true);
setErrors({});
try {
const response = await fetch('/scrape/story', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ url: importUrl }),
});
if (!response.ok) {
const errorData = await response.json();
throw new Error(errorData.error || 'Failed to scrape story');
}
const scrapedStory = await response.json();
// Pre-fill the form with scraped data
setFormData({
title: scrapedStory.title || '',
summary: scrapedStory.summary || '',
authorName: scrapedStory.author || '',
contentHtml: scrapedStory.content || '',
sourceUrl: scrapedStory.sourceUrl || importUrl,
tags: scrapedStory.tags || [],
seriesName: '',
volume: '',
});
// Switch to manual mode so user can edit the pre-filled data
setImportMode('manual');
setImportUrl('');
// Show success message
setErrors({ success: 'Story data imported successfully! Review and edit as needed before saving.' });
} catch (error: any) {
console.error('Failed to import story:', error);
setErrors({ importUrl: error.message });
} finally {
setScraping(false);
}
};
const validateForm = () => {
const newErrors: Record<string, string> = {};
@@ -206,7 +260,105 @@ export default function AddStoryPage() {
</p>
</div>
<form onSubmit={handleSubmit} className="space-y-6">
{/* Import Mode Toggle */}
<div className="mb-8">
<div className="flex border-b border-gray-200 dark:border-gray-700">
<button
type="button"
onClick={() => setImportMode('manual')}
className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
importMode === 'manual'
? 'border-theme-accent text-theme-accent'
: 'border-transparent theme-text hover:text-theme-accent'
}`}
>
Manual Entry
</button>
<button
type="button"
onClick={() => setImportMode('url')}
className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
importMode === 'url'
? 'border-theme-accent text-theme-accent'
: 'border-transparent theme-text hover:text-theme-accent'
}`}
>
Import from URL
</button>
</div>
</div>
{/* URL Import Section */}
{importMode === 'url' && (
<div className="bg-gray-50 dark:bg-gray-800/50 rounded-lg p-6 mb-8">
<h3 className="text-lg font-medium theme-header mb-4">Import Story from URL</h3>
<p className="theme-text text-sm mb-4">
Enter a URL from a supported story site to automatically extract the story content, title, author, and other metadata.
</p>
<div className="space-y-4">
<Input
label="Story URL"
type="url"
value={importUrl}
onChange={(e) => setImportUrl(e.target.value)}
placeholder="https://example.com/story-url"
error={errors.importUrl}
disabled={scraping}
/>
<div className="flex gap-3">
<Button
type="button"
onClick={handleImportFromUrl}
loading={scraping}
disabled={!importUrl.trim() || scraping}
>
{scraping ? 'Importing...' : 'Import Story'}
</Button>
<Button
type="button"
variant="ghost"
onClick={() => setImportMode('manual')}
disabled={scraping}
>
Enter Manually Instead
</Button>
</div>
<div className="border-t pt-4 mt-4">
<p className="text-sm theme-text mb-2">
Need to import multiple stories at once?
</p>
<Button
type="button"
variant="secondary"
onClick={() => router.push('/stories/import/bulk')}
disabled={scraping}
size="sm"
>
Bulk Import Multiple URLs
</Button>
</div>
<div className="text-xs theme-text">
<p className="font-medium mb-1">Supported Sites:</p>
<p>Archive of Our Own, DeviantArt, FanFiction.Net, Literotica, Royal Road, Wattpad, and more</p>
</div>
</div>
</div>
)}
{/* Success Message */}
{errors.success && (
<div className="p-4 bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg mb-6">
<p className="text-green-800 dark:text-green-200">{errors.success}</p>
</div>
)}
{importMode === 'manual' && (
<form onSubmit={handleSubmit} className="space-y-6">
{/* Title */}
<Input
label="Title *"
@@ -379,6 +531,7 @@ export default function AddStoryPage() {
</Button>
</div>
</form>
)}
</div>
</AppLayout>
);
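The pre-fill in handleImportFromUrl assumes the /scrape/story endpoint returns a flat JSON object. A minimal sketch of that payload, inferred only from the fields the form reads; the canonical type presumably lives in @/lib/scraper/types, which is not shown in this commit excerpt:

// Sketch only: field names inferred from the form pre-fill above; the
// authoritative definition is expected in @/lib/scraper/types.
interface ScrapedStory {
  title?: string;
  summary?: string;
  author?: string;       // mapped to formData.authorName
  content?: string;      // HTML body, mapped to formData.contentHtml
  sourceUrl?: string;    // falls back to the URL the user entered
  tags?: string[];
  coverImage?: string;   // returned by the scraper but not consumed by this form
}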

View File

@@ -0,0 +1,72 @@
import { NextRequest, NextResponse } from 'next/server';
export async function POST(request: NextRequest) {
try {
const body = await request.json();
const { url } = body;
if (!url || typeof url !== 'string') {
return NextResponse.json(
{ error: 'URL is required and must be a string' },
{ status: 400 }
);
}
// Dynamic import to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const stories = await scraper.scrapeAuthorPage(url);
return NextResponse.json({ stories });
} catch (error) {
console.error('Author page scraping error:', error);
// Check if it's a ScraperError without importing at module level
if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
return NextResponse.json(
{
error: (error as any).message,
url: (error as any).url
},
{ status: 400 }
);
}
if (error instanceof Error) {
// Handle specific error types
if (error.message.includes('Invalid URL')) {
return NextResponse.json(
{ error: 'Invalid URL provided' },
{ status: 400 }
);
}
if (error.message.includes('not supported')) {
return NextResponse.json(
{ error: 'Author page scraping is not supported for this website' },
{ status: 400 }
);
}
if (error.message.includes('HTTP 404')) {
return NextResponse.json(
{ error: 'Author page not found at the provided URL' },
{ status: 404 }
);
}
if (error.message.includes('timeout')) {
return NextResponse.json(
{ error: 'Request timed out while fetching content' },
{ status: 408 }
);
}
}
return NextResponse.json(
{ error: 'Failed to scrape author page. Please try again.' },
{ status: 500 }
);
}
}
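This handler (and the single-story route later in the commit) detects scraper failures via error.constructor.name === 'ScraperError' and then reads message and url off the error object. A minimal sketch of the error class that check assumes; the real class is expected in @/lib/scraper/types, which is not part of this excerpt:

// Assumed shape only; the actual class ships in @/lib/scraper/types.
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly url: string, // the URL that failed to scrape
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}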

View File

@@ -0,0 +1,292 @@
import { NextRequest, NextResponse } from 'next/server';
interface BulkImportRequest {
urls: string[];
}
interface ImportResult {
url: string;
status: 'imported' | 'skipped' | 'error';
reason?: string;
title?: string;
author?: string;
error?: string;
storyId?: string;
}
interface BulkImportResponse {
results: ImportResult[];
summary: {
total: number;
imported: number;
skipped: number;
errors: number;
};
}
export async function POST(request: NextRequest) {
try {
// Check for authentication
const authorization = request.headers.get('authorization');
if (!authorization) {
return NextResponse.json(
{ error: 'Authentication required for bulk import' },
{ status: 401 }
);
}
const body = await request.json();
const { urls } = body as BulkImportRequest;
if (!urls || !Array.isArray(urls) || urls.length === 0) {
return NextResponse.json(
{ error: 'URLs array is required and must not be empty' },
{ status: 400 }
);
}
if (urls.length > 50) {
return NextResponse.json(
{ error: 'Maximum 50 URLs allowed per bulk import' },
{ status: 400 }
);
}
// Dynamic imports to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const results: ImportResult[] = [];
let importedCount = 0;
let skippedCount = 0;
let errorCount = 0;
console.log(`Starting bulk scraping for ${urls.length} URLs`);
console.log(`Environment NEXT_PUBLIC_API_URL: ${process.env.NEXT_PUBLIC_API_URL}`);
// For server-side API calls in Docker, use direct backend container URL
// Client-side calls use NEXT_PUBLIC_API_URL through nginx, but server-side needs direct container access
const serverSideApiBaseUrl = 'http://backend:8080/api';
console.log(`DEBUG: serverSideApiBaseUrl variable is: ${serverSideApiBaseUrl}`);
// Quick test to verify backend connectivity
try {
console.log(`Testing backend connectivity at: ${serverSideApiBaseUrl}/stories/check-duplicate`);
const testResponse = await fetch(`${serverSideApiBaseUrl}/stories/check-duplicate?title=test&authorName=test`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
console.log(`Backend test response status: ${testResponse.status}`);
} catch (error) {
console.error(`Backend connectivity test failed:`, error);
}
for (const url of urls) {
console.log(`Processing URL: ${url}`);
try {
// Validate URL format
if (!url || typeof url !== 'string' || url.trim() === '') {
results.push({
url: url || 'Empty URL',
status: 'error',
error: 'Invalid URL format'
});
errorCount++;
continue;
}
const trimmedUrl = url.trim();
// Scrape the story
const scrapedStory = await scraper.scrapeStory(trimmedUrl);
// Validate required fields
if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
const missingFields = [];
if (!scrapedStory.title) missingFields.push('title');
if (!scrapedStory.author) missingFields.push('author');
if (!scrapedStory.content) missingFields.push('content');
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Missing required fields: ${missingFields.join(', ')}`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
// Check for duplicates using query parameters
try {
// Use the server-side base URL for direct container-to-container communication
const duplicateCheckUrl = `${serverSideApiBaseUrl}/stories/check-duplicate`;
console.log(`Duplicate check URL: ${duplicateCheckUrl}`);
const params = new URLSearchParams({
title: scrapedStory.title,
authorName: scrapedStory.author
});
const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (duplicateCheckResponse.ok) {
const duplicateResult = await duplicateCheckResponse.json();
if (duplicateResult.hasDuplicates) {
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Duplicate story found (${duplicateResult.count} existing)`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
}
} catch (error) {
console.warn('Duplicate check failed:', error);
// Continue with import if duplicate check fails
}
// Create the story
try {
const storyData = {
title: scrapedStory.title,
summary: scrapedStory.summary || undefined,
contentHtml: scrapedStory.content,
sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
authorName: scrapedStory.author,
tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
};
// Use the server-side base URL for direct container-to-container communication
const createUrl = `${serverSideApiBaseUrl}/stories`;
console.log(`Create story URL: ${createUrl}`);
const createResponse = await fetch(createUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
body: JSON.stringify(storyData),
});
if (!createResponse.ok) {
const errorData = await createResponse.json();
throw new Error(errorData.message || 'Failed to create story');
}
const createdStory = await createResponse.json();
results.push({
url: trimmedUrl,
status: 'imported',
title: scrapedStory.title,
author: scrapedStory.author,
storyId: createdStory.id
});
importedCount++;
console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);
} catch (error) {
console.error(`Failed to create story for ${trimmedUrl}:`, error);
let errorMessage = 'Failed to create story';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: trimmedUrl,
status: 'error',
error: errorMessage,
title: scrapedStory.title,
author: scrapedStory.author
});
errorCount++;
}
} catch (error) {
console.error(`Error processing URL ${url}:`, error);
let errorMessage = 'Unknown error';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: url,
status: 'error',
error: errorMessage
});
errorCount++;
}
}
const response: BulkImportResponse = {
results,
summary: {
total: urls.length,
imported: importedCount,
skipped: skippedCount,
errors: errorCount
}
};
console.log(`Bulk import completed:`, response.summary);
// Trigger Typesense reindex if any stories were imported
if (importedCount > 0) {
try {
console.log('Triggering Typesense reindex after bulk import...');
const reindexUrl = `${serverSideApiBaseUrl}/stories/reindex-typesense`;
const reindexResponse = await fetch(reindexUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (reindexResponse.ok) {
const reindexResult = await reindexResponse.json();
console.log('Typesense reindex completed:', reindexResult);
} else {
console.warn('Typesense reindex failed:', reindexResponse.status);
}
} catch (error) {
console.warn('Failed to trigger Typesense reindex:', error);
// Don't fail the whole request if reindex fails
}
}
return NextResponse.json(response);
} catch (error) {
console.error('Bulk import error:', error);
if (error instanceof Error) {
return NextResponse.json(
{ error: `Bulk import failed: ${error.message}` },
{ status: 500 }
);
}
return NextResponse.json(
{ error: 'Bulk import failed due to an unknown error' },
{ status: 500 }
);
}
}
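The duplicate check above depends on a backend endpoint whose contract is only implied here. A sketch of what this route assumes for GET /api/stories/check-duplicate, with field names taken from how the response is read above (the backend side is not part of this excerpt):

// Query parameters sent: title and authorName, both URL-encoded strings.
// Assumed response body, based on the fields the handler reads:
interface DuplicateCheckResponse {
  hasDuplicates: boolean; // true if at least one matching story already exists
  count: number;          // number of existing matches, surfaced in the skip reason
}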

View File

@@ -0,0 +1,85 @@
import { NextRequest, NextResponse } from 'next/server';
export async function POST(request: NextRequest) {
try {
const body = await request.json();
const { url } = body;
if (!url || typeof url !== 'string') {
return NextResponse.json(
{ error: 'URL is required and must be a string' },
{ status: 400 }
);
}
// Dynamic import to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const story = await scraper.scrapeStory(url);
// Debug logging
console.log('Scraped story data:', {
url: url,
title: story.title,
author: story.author,
summary: story.summary,
contentLength: story.content?.length || 0,
contentPreview: story.content ? story.content.substring(0, 200) + '...' : '',
tags: story.tags,
coverImage: story.coverImage
});
return NextResponse.json(story);
} catch (error) {
console.error('Story scraping error:', error);
// Check if it's a ScraperError without importing at module level
if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
return NextResponse.json(
{
error: (error as any).message,
url: (error as any).url
},
{ status: 400 }
);
}
if (error instanceof Error) {
// Handle specific error types
if (error.message.includes('Invalid URL')) {
return NextResponse.json(
{ error: 'Invalid URL provided' },
{ status: 400 }
);
}
if (error.message.includes('Unsupported site')) {
return NextResponse.json(
{ error: 'This website is not supported for scraping' },
{ status: 400 }
);
}
if (error.message.includes('HTTP 404')) {
return NextResponse.json(
{ error: 'Story not found at the provided URL' },
{ status: 404 }
);
}
if (error.message.includes('timeout')) {
return NextResponse.json(
{ error: 'Request timed out while fetching content' },
{ status: 408 }
);
}
}
return NextResponse.json(
{ error: 'Failed to scrape story. Please try again.' },
{ status: 500 }
);
}
}
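One caveat on the error handling in these routes: comparing error.constructor.name to 'ScraperError' can break if class names are mangled during minification or the error crosses bundle boundaries. A hedged alternative, assuming ScraperError is exported from @/lib/scraper/types as the dynamic import above suggests, is to perform the dynamic import inside the error path and use instanceof:

import { NextResponse } from 'next/server';

// Sketch only: instanceof-based ScraperError handling. Assumes ScraperError
// (with message and url fields) is exported from @/lib/scraper/types.
async function scraperErrorResponse(error: unknown): Promise<NextResponse | null> {
  const { ScraperError } = await import('@/lib/scraper/types');
  if (error instanceof ScraperError) {
    // Mirrors the fields the handlers return today
    return NextResponse.json({ error: error.message, url: error.url }, { status: 400 });
  }
  return null; // not a scraper error; caller falls through to its generic handling
}

A route's catch block could then start with: const resp = await scraperErrorResponse(error); if (resp) return resp;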

View File

@@ -0,0 +1,300 @@
'use client';
import { useState } from 'react';
import { useRouter } from 'next/navigation';
import Link from 'next/link';
import { ArrowLeftIcon } from '@heroicons/react/24/outline';
interface ImportResult {
url: string;
status: 'imported' | 'skipped' | 'error';
reason?: string;
title?: string;
author?: string;
error?: string;
storyId?: string;
}
interface BulkImportResponse {
results: ImportResult[];
summary: {
total: number;
imported: number;
skipped: number;
errors: number;
};
}
export default function BulkImportPage() {
const router = useRouter();
const [urls, setUrls] = useState('');
const [isLoading, setIsLoading] = useState(false);
const [results, setResults] = useState<BulkImportResponse | null>(null);
const [error, setError] = useState<string | null>(null);
const handleSubmit = async (e: React.FormEvent) => {
e.preventDefault();
if (!urls.trim()) {
setError('Please enter at least one URL');
return;
}
setIsLoading(true);
setError(null);
setResults(null);
try {
// Parse URLs from textarea (one per line)
const urlList = urls
.split('\n')
.map(url => url.trim())
.filter(url => url.length > 0);
if (urlList.length === 0) {
setError('Please enter at least one valid URL');
setIsLoading(false);
return;
}
if (urlList.length > 50) {
setError('Maximum 50 URLs allowed per bulk import');
setIsLoading(false);
return;
}
// Get auth token for server-side API calls
const token = localStorage.getItem('auth-token');
const response = await fetch('/scrape/bulk', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': token ? `Bearer ${token}` : '',
},
body: JSON.stringify({ urls: urlList }),
});
if (!response.ok) {
const errorData = await response.json();
throw new Error(errorData.error || 'Bulk import failed');
}
const data: BulkImportResponse = await response.json();
setResults(data);
} catch (err) {
console.error('Bulk import error:', err);
setError(err instanceof Error ? err.message : 'Failed to import stories');
} finally {
setIsLoading(false);
}
};
const handleReset = () => {
setUrls('');
setResults(null);
setError(null);
};
const getStatusColor = (status: string) => {
switch (status) {
case 'imported': return 'text-green-700 bg-green-50 border-green-200';
case 'skipped': return 'text-yellow-700 bg-yellow-50 border-yellow-200';
case 'error': return 'text-red-700 bg-red-50 border-red-200';
default: return 'text-gray-700 bg-gray-50 border-gray-200';
}
};
const getStatusIcon = (status: string) => {
switch (status) {
case 'imported': return '✓';
case 'skipped': return '⚠';
case 'error': return '✗';
default: return '';
}
};
return (
<div className="container mx-auto px-4 py-6">
<div className="max-w-4xl mx-auto">
{/* Header */}
<div className="mb-6">
<div className="flex items-center gap-4 mb-4">
<Link
href="/library"
className="inline-flex items-center text-blue-600 hover:text-blue-800"
>
<ArrowLeftIcon className="h-4 w-4 mr-1" />
Back to Library
</Link>
</div>
<h1 className="text-3xl font-bold text-gray-900 mb-2">Bulk Import Stories</h1>
<p className="text-gray-600">
Import multiple stories at once by providing a list of URLs. Each URL will be scraped
and automatically added to your story collection.
</p>
</div>
{!results ? (
// Import Form
<form onSubmit={handleSubmit} className="space-y-6">
<div>
<label htmlFor="urls" className="block text-sm font-medium text-gray-700 mb-2">
Story URLs
</label>
<p className="text-sm text-gray-500 mb-3">
Enter one URL per line. Maximum 50 URLs per import.
</p>
<textarea
id="urls"
value={urls}
onChange={(e) => setUrls(e.target.value)}
placeholder="https://example.com/story1&#10;https://example.com/story2&#10;https://example.com/story3"
className="w-full h-64 px-3 py-2 border border-gray-300 rounded-md shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent"
disabled={isLoading}
/>
<p className="mt-2 text-sm text-gray-500">
URLs: {urls.split('\n').filter(url => url.trim().length > 0).length}
</p>
</div>
{error && (
<div className="bg-red-50 border border-red-200 rounded-md p-4">
<div className="flex">
<div className="ml-3">
<h3 className="text-sm font-medium text-red-800">Error</h3>
<div className="mt-2 text-sm text-red-700">
{error}
</div>
</div>
</div>
</div>
)}
<div className="flex gap-4">
<button
type="submit"
disabled={isLoading || !urls.trim()}
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
>
{isLoading ? 'Importing...' : 'Start Import'}
</button>
<button
type="button"
onClick={handleReset}
disabled={isLoading}
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
>
Clear
</button>
</div>
{isLoading && (
<div className="bg-blue-50 border border-blue-200 rounded-md p-4">
<div className="flex items-center">
<div className="animate-spin rounded-full h-5 w-5 border-b-2 border-blue-600 mr-3"></div>
<div>
<p className="text-sm font-medium text-blue-800">Processing URLs...</p>
<p className="text-sm text-blue-600">
This may take a few minutes depending on the number of URLs and response times of the source websites.
</p>
</div>
</div>
</div>
)}
</form>
) : (
// Results
<div className="space-y-6">
{/* Summary */}
<div className="bg-white border border-gray-200 rounded-lg p-6">
<h2 className="text-xl font-semibold text-gray-900 mb-4">Import Summary</h2>
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
<div className="text-center">
<div className="text-2xl font-bold text-gray-900">{results.summary.total}</div>
<div className="text-sm text-gray-600">Total URLs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-600">{results.summary.imported}</div>
<div className="text-sm text-gray-600">Imported</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-yellow-600">{results.summary.skipped}</div>
<div className="text-sm text-gray-600">Skipped</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-red-600">{results.summary.errors}</div>
<div className="text-sm text-gray-600">Errors</div>
</div>
</div>
</div>
{/* Detailed Results */}
<div className="bg-white border border-gray-200 rounded-lg">
<div className="px-6 py-4 border-b border-gray-200">
<h3 className="text-lg font-medium text-gray-900">Detailed Results</h3>
</div>
<div className="divide-y divide-gray-200">
{results.results.map((result, index) => (
<div key={index} className="p-6">
<div className="flex items-start justify-between">
<div className="flex-1 min-w-0">
<div className="flex items-center gap-2 mb-2">
<span className={`inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium border ${getStatusColor(result.status)}`}>
{getStatusIcon(result.status)} {result.status.charAt(0).toUpperCase() + result.status.slice(1)}
</span>
</div>
<p className="text-sm text-gray-900 font-medium truncate mb-1">
{result.url}
</p>
{result.title && result.author && (
<p className="text-sm text-gray-600 mb-1">
"{result.title}" by {result.author}
</p>
)}
{result.reason && (
<p className="text-sm text-gray-500">
{result.reason}
</p>
)}
{result.error && (
<p className="text-sm text-red-600">
Error: {result.error}
</p>
)}
</div>
</div>
</div>
))}
</div>
</div>
{/* Actions */}
<div className="flex gap-4">
<button
onClick={handleReset}
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2"
>
Import More URLs
</button>
<Link
href="/stories"
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2"
>
View Stories
</Link>
</div>
</div>
)}
</div>
</div>
);
}