storycove/frontend/src/app/scrape/bulk/route.ts

import { NextRequest, NextResponse } from 'next/server';
import { sendProgressUpdate } from '../../../lib/progress';

// Configure route timeout for long-running scraping operations
export const maxDuration = 900; // 15 minutes (900 seconds)
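
/**
 * POST /scrape/bulk — starts a long-running bulk scrape in the background and
 * reports progress via sendProgressUpdate, keyed by the caller-supplied
 * sessionId. Two modes: import each URL as its own story, or combine every
 * URL's content into a single story payload.
 */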

interface BulkImportRequest {
  urls: string[];
  combineIntoOne?: boolean;
  sessionId?: string; // For progress tracking
}

interface ImportResult {
  url: string;
  status: 'imported' | 'skipped' | 'error';
  reason?: string;
  title?: string;
  author?: string;
  error?: string;
  storyId?: string;
}

interface BulkImportResponse {
  results: ImportResult[];
  summary: {
    total: number;
    imported: number;
    skipped: number;
    errors: number;
  };
  combinedStory?: {
    title: string;
    author: string;
    content: string;
    summary?: string;
    sourceUrl: string;
    tags?: string[];
  };
}
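
// Note: BulkImportResponse documents the shape of the final 'completed'
// progress payload; it is not referenced directly elsewhere in this route.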

// Background processing for combined mode: scrapes every URL, merges the
// content into a single story, and delivers the result via the progress channel.
async function processCombinedMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any // StoryScraper instance; typed loosely because it is imported dynamically
) {
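  // Accumulators, filled per URL; the base* fields come from the first
  // successfully scraped URL.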
  const results: ImportResult[] = [];
  let importedCount = 0;
  let errorCount = 0;
  const combinedContent: string[] = [];
  let baseTitle = '';
  let baseAuthor = '';
  let baseSummary = '';
  let baseSourceUrl = '';
  const combinedTags = new Set<string>();
  let totalWordCount = 0;

  // Send initial progress update
  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to scrape ${urls.length} URLs for combining...`,
    totalWordCount: 0
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Scraping URL ${i + 1}/${urls.length} for combine: ${url}`);

    // Send progress update
    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Scraping URL ${i + 1} of ${urls.length}...`,
      url: url,
      totalWordCount
    });

    try {
      const trimmedUrl = url.trim();
      if (!trimmedUrl) {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Empty URL in combined mode'
        });
        errorCount++;
        continue;
      }

      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Content is required for combined mode
      if (!scrapedStory.content || scrapedStory.content.trim() === '') {
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: 'No content found - required for combined mode'
        });
        errorCount++;
        continue;
      }

      // Use the first URL for base metadata (missing fields fall back to defaults)
      if (i === 0) {
        baseTitle = scrapedStory.title || 'Combined Story';
        baseAuthor = scrapedStory.author || 'Unknown Author';
        baseSummary = scrapedStory.summary || '';
        baseSourceUrl = trimmedUrl;
      }

      // Add content with URL separator
      combinedContent.push(`<!-- Content from: ${trimmedUrl} -->`);
      if (scrapedStory.title && i > 0) {
        combinedContent.push(`<h2>${scrapedStory.title}</h2>`);
      }
      combinedContent.push(scrapedStory.content);
      combinedContent.push('<hr/>'); // Visual separator between parts

      // Calculate word count for this story
      const textContent = scrapedStory.content.replace(/<[^>]*>/g, ''); // Strip HTML
      const wordCount = textContent.split(/\s+/).filter((word: string) => word.length > 0).length;
      totalWordCount += wordCount;

      // Collect tags from all stories
      if (scrapedStory.tags) {
        scrapedStory.tags.forEach((tag: string) => combinedTags.add(tag));
      }

      results.push({
        url: trimmedUrl,
        status: 'imported',
        title: scrapedStory.title,
        author: scrapedStory.author
      });
      importedCount++;

      // Send progress update with word count
      await sendProgressUpdate(sessionId, {
        type: 'progress',
        current: i + 1,
        total: urls.length,
        message: `Scraped "${scrapedStory.title}" (${wordCount.toLocaleString()} words)`,
        url: trimmedUrl,
        title: scrapedStory.title,
        author: scrapedStory.author,
        wordCount: wordCount,
        totalWordCount: totalWordCount
      });
    } catch (error) {
      console.error(`Error processing URL ${url} in combined mode:`, error);
      results.push({
        url: url,
        status: 'error',
        error: error instanceof Error ? error.message : 'Unknown error'
      });
      errorCount++;
    }
  }

  // If we have any errors, fail the entire combined operation
  if (errorCount > 0) {
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: urls.length,
      total: urls.length,
      message: 'Combined mode failed: some URLs could not be processed',
      error: `${errorCount} URLs failed to process`
    });
    return;
  }

  // Check content size to prevent response size issues
  const combinedContentString = combinedContent.join('\n');
  const contentSizeInMB = new Blob([combinedContentString]).size / (1024 * 1024);
  console.log(`Combined content size: ${contentSizeInMB.toFixed(2)} MB`);
  console.log(`Combined content character length: ${combinedContentString.length}`);
  console.log(`Combined content parts count: ${combinedContent.length}`);
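
  // If the combined content exceeds 10 MB, the character length is scaled by
  // (10 / sizeInMB) to approximate a 10 MB cut. This assumes roughly one byte
  // per character, so multi-byte content may land slightly over or under, and
  // the cut can fall mid-tag since it ignores HTML structure.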
  // Handle content truncation if needed
  const finalContent = contentSizeInMB > 10
    ? combinedContentString.substring(0, Math.floor(combinedContentString.length * (10 / contentSizeInMB))) + '\n\n<!-- Content truncated due to size limit -->'
    : combinedContentString;
  let finalSummary = contentSizeInMB > 10 ? baseSummary + ' (Content truncated due to size limit)' : baseSummary;

  // Check if combined content has images and mark for processing
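  // Note: this regex only matches <img> tags whose src value is quoted;
  // unquoted src attributes would go undetected.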
  const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(finalContent);
  if (hasImages) {
    finalSummary += ' (Contains embedded images - will be processed after story creation)';
    console.log(`Combined story contains embedded images - will need processing after creation`);
  }

  // Return the combined story data via progress update
  const combinedStory = {
    title: baseTitle,
    author: baseAuthor,
    content: finalContent,
    summary: finalSummary,
    sourceUrl: baseSourceUrl,
    tags: Array.from(combinedTags),
    hasImages: hasImages
  };

  // Send completion notification for combine mode
  let completionMessage = `Combined scraping completed: ${totalWordCount.toLocaleString()} words from ${importedCount} stories`;
  if (hasImages) {
    completionMessage += ` (embedded images will be processed when story is created)`;
  }
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: completionMessage,
    totalWordCount: totalWordCount,
    combinedStory: combinedStory
  });
  console.log(`Combined scraping completed: ${importedCount} URLs combined into one story`);
}

// Background processing for individual mode: imports each URL as its own story.
async function processIndividualMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any // StoryScraper instance; typed loosely because it is imported dynamically
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let skippedCount = 0;
  let errorCount = 0;

  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to import ${urls.length} URLs individually...`
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Processing URL ${i + 1}/${urls.length}: ${url}`);

    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Processing URL ${i + 1} of ${urls.length}...`,
      url: url
    });

    try {
      // Validate URL format
      if (!url || typeof url !== 'string' || url.trim() === '') {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Invalid URL format'
        });
        errorCount++;
        continue;
      }
      const trimmedUrl = url.trim();

      // Scrape the story
      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Validate required fields
      if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
        const missingFields = [];
        if (!scrapedStory.title) missingFields.push('title');
        if (!scrapedStory.author) missingFields.push('author');
        if (!scrapedStory.content) missingFields.push('content');
        results.push({
          url: trimmedUrl,
          status: 'skipped',
          reason: `Missing required fields: ${missingFields.join(', ')}`,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        skippedCount++;
        continue;
      }

      // Check for duplicates using query parameters
      try {
        const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
        const params = new URLSearchParams({
          title: scrapedStory.title,
          authorName: scrapedStory.author
        });
        const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });
        if (duplicateCheckResponse.ok) {
          const duplicateResult = await duplicateCheckResponse.json();
          if (duplicateResult.hasDuplicates) {
            results.push({
              url: trimmedUrl,
              status: 'skipped',
              reason: `Duplicate story found (${duplicateResult.count} existing)`,
              title: scrapedStory.title,
              author: scrapedStory.author
            });
            skippedCount++;
            continue;
          }
        }
      } catch (error) {
        console.warn('Duplicate check failed:', error);
        // Continue with import if duplicate check fails
      }

      // Create the story
      try {
        const storyData = {
          title: scrapedStory.title,
          summary: scrapedStory.summary || undefined,
          contentHtml: scrapedStory.content,
          sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
          authorName: scrapedStory.author,
          tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
        };
        const createUrl = `http://backend:8080/api/stories`;
        const createResponse = await fetch(createUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify(storyData),
        });
        if (!createResponse.ok) {
          const errorData = await createResponse.json();
          throw new Error(errorData.message || 'Failed to create story');
        }
        const createdStory = await createResponse.json();

        // Process embedded images if content contains images
        let imageProcessingWarnings: string[] = [];
        const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(scrapedStory.content);
        if (hasImages) {
          try {
            console.log(`Processing embedded images for story: ${createdStory.id}`);
            const imageProcessUrl = `http://backend:8080/api/stories/${createdStory.id}/process-content-images`;
            const imageProcessResponse = await fetch(imageProcessUrl, {
              method: 'POST',
              headers: {
                'Authorization': authorization,
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({ htmlContent: scrapedStory.content }),
            });
            if (imageProcessResponse.ok) {
              const imageResult = await imageProcessResponse.json();
              if (imageResult.hasWarnings && imageResult.warnings) {
                imageProcessingWarnings = imageResult.warnings;
                console.log(`Image processing completed with warnings for story ${createdStory.id}:`, imageResult.warnings);
              } else {
                console.log(`Image processing completed successfully for story ${createdStory.id}. Downloaded ${imageResult.downloadedImages?.length || 0} images.`);
              }
              // Update story content with processed images
              if (imageResult.processedContent && imageResult.processedContent !== scrapedStory.content) {
                const updateUrl = `http://backend:8080/api/stories/${createdStory.id}`;
                const updateResponse = await fetch(updateUrl, {
                  method: 'PUT',
                  headers: {
                    'Authorization': authorization,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify({
                    contentHtml: imageResult.processedContent
                  }),
                });
                if (!updateResponse.ok) {
                  console.warn(`Failed to update story content after image processing for ${createdStory.id}`);
                  imageProcessingWarnings.push('Failed to update story content with processed images');
                }
              }
            } else {
              console.warn(`Image processing failed for story ${createdStory.id}:`, imageProcessResponse.status);
              imageProcessingWarnings.push('Image processing failed');
            }
          } catch (error) {
            console.error(`Error processing images for story ${createdStory.id}:`, error);
            imageProcessingWarnings.push(`Image processing error: ${error instanceof Error ? error.message : 'Unknown error'}`);
          }
        }

        results.push({
          url: trimmedUrl,
          status: 'imported',
          title: scrapedStory.title,
          author: scrapedStory.author,
          storyId: createdStory.id
        });
        importedCount++;
        console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})${hasImages ? ` with ${imageProcessingWarnings.length > 0 ? 'warnings' : 'successful image processing'}` : ''}`);

        // Send progress update for successful import
        let progressMessage = `Imported "${scrapedStory.title}" by ${scrapedStory.author}`;
        if (hasImages) {
          progressMessage += imageProcessingWarnings.length > 0 ? ' (with image warnings)' : ' (with images)';
        }
        await sendProgressUpdate(sessionId, {
          type: 'progress',
          current: i + 1,
          total: urls.length,
          message: progressMessage,
          url: trimmedUrl,
          title: scrapedStory.title,
          author: scrapedStory.author,
          hasImages: hasImages,
          imageWarnings: imageProcessingWarnings
        });
      } catch (error) {
        console.error(`Failed to create story for ${trimmedUrl}:`, error);
        let errorMessage = 'Failed to create story';
        if (error instanceof Error) {
          errorMessage = error.message;
        }
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: errorMessage,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        errorCount++;
      }
    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);
      let errorMessage = 'Unknown error';
      if (error instanceof Error) {
        errorMessage = error.message;
      }
      results.push({
        url: url,
        status: 'error',
        error: errorMessage
      });
      errorCount++;
    }
  }

  // Send completion notification
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`,
    results: results,
    summary: {
      total: urls.length,
      imported: importedCount,
      skipped: skippedCount,
      errors: errorCount
    }
  });
  console.log(`Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`);

  // Trigger Solr reindex if any stories were imported
  if (importedCount > 0) {
    try {
      console.log('Triggering Solr reindex after bulk import...');
      const reindexUrl = `http://backend:8080/api/admin/search/solr/reindex`;
      const reindexResponse = await fetch(reindexUrl, {
        method: 'POST',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      if (reindexResponse.ok) {
        const reindexResult = await reindexResponse.json();
        console.log('Solr reindex completed:', reindexResult);
      } else {
        console.warn('Solr reindex failed:', reindexResponse.status);
      }
    } catch (error) {
      console.warn('Failed to trigger Solr reindex:', error);
      // Don't fail the whole operation if reindex fails
    }
  }
}

// Background entry point: loads the scraper and dispatches to combined or
// individual processing. Runs after the POST handler has already responded.
async function processBulkImport(
  urls: string[],
  combineIntoOne: boolean,
  sessionId: string,
  authorization: string
) {
  try {
    // Dynamic imports to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');
    const scraper = new StoryScraper();
    console.log(`Starting bulk scraping for ${urls.length} URLs${combineIntoOne ? ' (combine mode)' : ''}`);
    console.log(`Session ID: ${sessionId}`);

    // Quick test to verify backend connectivity
    try {
      console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
      const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
        method: 'GET',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      console.log(`Backend test response status: ${testResponse.status}`);
    } catch (error) {
      console.error(`Backend connectivity test failed:`, error);
    }

    // Handle combined mode
    if (combineIntoOne) {
      await processCombinedMode(urls, sessionId, authorization, scraper);
    } else {
      // Normal individual processing mode
      await processIndividualMode(urls, sessionId, authorization, scraper);
    }
  } catch (error) {
    console.error('Background bulk import error:', error);
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: 0,
      total: urls.length,
      message: 'Bulk import failed due to an error',
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
}
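
/**
 * POST handler: validates the request (auth header, a non-empty URL list of
 * at most 200 entries, a sessionId), fires off processBulkImport without
 * awaiting it, and returns immediately so the client can follow progress
 * under the given sessionId.
 */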
export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls, combineIntoOne = false, sessionId } = body as BulkImportRequest;
    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }
    if (urls.length > 200) {
      return NextResponse.json(
        { error: 'Maximum 200 URLs allowed per bulk import' },
        { status: 400 }
      );
    }
    if (!sessionId) {
      return NextResponse.json(
        { error: 'Session ID is required for progress tracking' },
        { status: 400 }
      );
    }

    // Start the background processing (intentionally not awaited)
    processBulkImport(urls, combineIntoOne, sessionId, authorization).catch(error => {
      console.error('Failed to start background processing:', error);
    });

    // Return immediately with session info
    return NextResponse.json({
      message: 'Bulk import started',
      sessionId: sessionId,
      totalUrls: urls.length,
      combineMode: combineIntoOne
    });
  } catch (error) {
    console.error('Bulk import initialization error:', error);
    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed to start: ${error.message}` },
        { status: 500 }
      );
    }
    return NextResponse.json(
      { error: 'Bulk import failed to start due to an unknown error' },
      { status: 500 }
    );
  }
}
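
/*
 * Example client call (a sketch; the endpoint path follows this file's
 * location, and the progress channel behind sendProgressUpdate is assumed to
 * be readable by the same sessionId the client generates):
 *
 *   const sessionId = crypto.randomUUID();
 *   const res = await fetch('/scrape/bulk', {
 *     method: 'POST',
 *     headers: {
 *       'Authorization': `Bearer ${token}`, // token: hypothetical auth credential
 *       'Content-Type': 'application/json',
 *     },
 *     body: JSON.stringify({ urls, combineIntoOne: false, sessionId }),
 *   });
 *   // res resolves as soon as the import is queued; watch progress via sessionId.
 */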