import { NextRequest, NextResponse } from 'next/server';

// Configure route timeout for long-running scraping operations
export const maxDuration = 900; // 15 minutes (900 seconds)

// Import progress tracking helper
import { sendProgressUpdate } from '../../../lib/progress';

interface BulkImportRequest {
  urls: string[];
  combineIntoOne?: boolean;
  sessionId?: string; // For progress tracking
}

interface ImportResult {
  url: string;
  status: 'imported' | 'skipped' | 'error';
  reason?: string;
  title?: string;
  author?: string;
  error?: string;
  storyId?: string;
}

interface BulkImportResponse {
  results: ImportResult[];
  summary: {
    total: number;
    imported: number;
    skipped: number;
    errors: number;
  };
  combinedStory?: {
    title: string;
    author: string;
    content: string;
    summary?: string;
    sourceUrl: string;
    tags?: string[];
  };
}
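// The payload accepted by `sendProgressUpdate` is not typed in this file
// (../../../lib/progress is not shown). The shape below is a sketch inferred
// from the calls made throughout this route; it is an assumption for
// documentation purposes, not the helper's actual exported type.
interface ProgressUpdate {
  type: 'progress' | 'completed' | 'error';
  current: number;
  total: number;
  message: string;
  url?: string;
  title?: string;
  author?: string;
  error?: string;
  wordCount?: number;
  totalWordCount?: number;
  hasImages?: boolean;
  imageWarnings?: string[];
  results?: ImportResult[];
  summary?: BulkImportResponse['summary'];
  combinedStory?: Record<string, unknown>;
}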
// Background processing function for combined mode
async function processCombinedMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let errorCount = 0;

  const combinedContent: string[] = [];
  let baseTitle = '';
  let baseAuthor = '';
  let baseSummary = '';
  let baseSourceUrl = '';
  const combinedTags = new Set<string>();
  let totalWordCount = 0;

  // Send initial progress update
  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to scrape ${urls.length} URLs for combining...`,
    totalWordCount: 0
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Scraping URL ${i + 1}/${urls.length} for combine: ${url}`);

    // Send progress update
    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Scraping URL ${i + 1} of ${urls.length}...`,
      url: url,
      totalWordCount
    });

    try {
      const trimmedUrl = url.trim();
      if (!trimmedUrl) {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Empty URL in combined mode'
        });
        errorCount++;
        continue;
      }

      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Check if we got content - this is required for combined mode
      if (!scrapedStory.content || scrapedStory.content.trim() === '') {
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: 'No content found - required for combined mode'
        });
        errorCount++;
        continue;
      }

      // Use first URL for base metadata (title can be empty for combined mode)
      if (i === 0) {
        baseTitle = scrapedStory.title || 'Combined Story';
        baseAuthor = scrapedStory.author || 'Unknown Author';
        baseSummary = scrapedStory.summary || '';
        baseSourceUrl = trimmedUrl;
      }

      // Add content with URL separator
      combinedContent.push(`<hr>`);
      if (scrapedStory.title && i > 0) {
        combinedContent.push(`<h2>${scrapedStory.title}</h2>`);
      }
      combinedContent.push(scrapedStory.content);
      combinedContent.push('<hr>'); // Visual separator between parts

      // Calculate word count for this story
      const textContent = scrapedStory.content.replace(/<[^>]*>/g, ''); // Strip HTML
      const wordCount = textContent.split(/\s+/).filter((word: string) => word.length > 0).length;
      totalWordCount += wordCount;

      // Collect tags from all stories
      if (scrapedStory.tags) {
        scrapedStory.tags.forEach((tag: string) => combinedTags.add(tag));
      }

      results.push({
        url: trimmedUrl,
        status: 'imported',
        title: scrapedStory.title,
        author: scrapedStory.author
      });
      importedCount++;

      // Send progress update with word count
      await sendProgressUpdate(sessionId, {
        type: 'progress',
        current: i + 1,
        total: urls.length,
        message: `Scraped "${scrapedStory.title}" (${wordCount.toLocaleString()} words)`,
        url: trimmedUrl,
        title: scrapedStory.title,
        author: scrapedStory.author,
        wordCount: wordCount,
        totalWordCount: totalWordCount
      });
    } catch (error) {
      console.error(`Error processing URL ${url} in combined mode:`, error);
      results.push({
        url: url,
        status: 'error',
        error: error instanceof Error ? error.message : 'Unknown error'
      });
      errorCount++;
    }
  }

  // If we have any errors, fail the entire combined operation
  if (errorCount > 0) {
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: urls.length,
      total: urls.length,
      message: 'Combined mode failed: some URLs could not be processed',
      error: `${errorCount} URLs failed to process`
    });
    return;
  }

  // Check content size to prevent response size issues
  const combinedContentString = combinedContent.join('\n');
  const contentSizeInMB = new Blob([combinedContentString]).size / (1024 * 1024);
  console.log(`Combined content size: ${contentSizeInMB.toFixed(2)} MB`);
  console.log(`Combined content character length: ${combinedContentString.length}`);
  console.log(`Combined content parts count: ${combinedContent.length}`);

  // Truncate the content proportionally if it exceeds 10 MB, and note the
  // truncation in the summary
  const finalContent = contentSizeInMB > 10
    ? combinedContentString.substring(0, Math.floor(combinedContentString.length * (10 / contentSizeInMB))) + '\n\n'
    : combinedContentString;
  let finalSummary = contentSizeInMB > 10
    ? baseSummary + ' (Content truncated due to size limit)'
    : baseSummary;

  // Check if combined content has images and mark for processing
  const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(finalContent);
  if (hasImages) {
    finalSummary += ' (Contains embedded images - will be processed after story creation)';
    console.log(`Combined story contains embedded images - will need processing after creation`);
  }

  // Return the combined story data via progress update
  const combinedStory = {
    title: baseTitle,
    author: baseAuthor,
    content: finalContent,
    summary: finalSummary,
    sourceUrl: baseSourceUrl,
    tags: Array.from(combinedTags),
    hasImages: hasImages
  };

  // Send completion notification for combine mode
  let completionMessage = `Combined scraping completed: ${totalWordCount.toLocaleString()} words from ${importedCount} stories`;
  if (hasImages) {
    completionMessage += ` (embedded images will be processed when story is created)`;
  }
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: completionMessage,
    totalWordCount: totalWordCount,
    combinedStory: combinedStory
  });

  console.log(`Combined scraping completed: ${importedCount} URLs combined into one story`);
}
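// Note on combined mode: unlike individual mode below, this function never
// calls the backend to persist anything. The assembled `combinedStory` is
// delivered to the client in the final `completed` progress event, and the
// client is expected to create the story (and trigger image processing)
// from that payload afterwards.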
// Background processing function for individual mode
async function processIndividualMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let skippedCount = 0;
  let errorCount = 0;

  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to import ${urls.length} URLs individually...`
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Processing URL ${i + 1}/${urls.length}: ${url}`);

    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Processing URL ${i + 1} of ${urls.length}...`,
      url: url
    });

    try {
      // Validate URL format
      if (!url || typeof url !== 'string' || url.trim() === '') {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Invalid URL format'
        });
        errorCount++;
        continue;
      }

      const trimmedUrl = url.trim();

      // Scrape the story
      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Validate required fields
      if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
        const missingFields: string[] = [];
        if (!scrapedStory.title) missingFields.push('title');
        if (!scrapedStory.author) missingFields.push('author');
        if (!scrapedStory.content) missingFields.push('content');

        results.push({
          url: trimmedUrl,
          status: 'skipped',
          reason: `Missing required fields: ${missingFields.join(', ')}`,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        skippedCount++;
        continue;
      }

      // Check for duplicates using query parameters
      try {
        const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
        const params = new URLSearchParams({
          title: scrapedStory.title,
          authorName: scrapedStory.author
        });

        const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });

        if (duplicateCheckResponse.ok) {
          const duplicateResult = await duplicateCheckResponse.json();
          if (duplicateResult.hasDuplicates) {
            results.push({
              url: trimmedUrl,
              status: 'skipped',
              reason: `Duplicate story found (${duplicateResult.count} existing)`,
              title: scrapedStory.title,
              author: scrapedStory.author
            });
            skippedCount++;
            continue;
          }
        }
      } catch (error) {
        console.warn('Duplicate check failed:', error);
        // Continue with import if duplicate check fails
      }
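      // Note: the duplicate-check endpoint's response shape is not defined in
      // this file; the reads above assume it looks roughly like
      // `{ hasDuplicates: boolean, count: number }`.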
      // Create the story
      try {
        const storyData = {
          title: scrapedStory.title,
          summary: scrapedStory.summary || undefined,
          contentHtml: scrapedStory.content,
          sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
          authorName: scrapedStory.author,
          tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
        };

        const createUrl = `http://backend:8080/api/stories`;
        const createResponse = await fetch(createUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify(storyData),
        });

        if (!createResponse.ok) {
          const errorData = await createResponse.json();
          throw new Error(errorData.message || 'Failed to create story');
        }

        const createdStory = await createResponse.json();

        // Process embedded images if content contains images
        let imageProcessingWarnings: string[] = [];
        const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(scrapedStory.content);

        if (hasImages) {
          try {
            console.log(`Processing embedded images for story: ${createdStory.id}`);
            const imageProcessUrl = `http://backend:8080/api/stories/${createdStory.id}/process-content-images`;
            const imageProcessResponse = await fetch(imageProcessUrl, {
              method: 'POST',
              headers: {
                'Authorization': authorization,
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({ htmlContent: scrapedStory.content }),
            });

            if (imageProcessResponse.ok) {
              const imageResult = await imageProcessResponse.json();
              if (imageResult.hasWarnings && imageResult.warnings) {
                imageProcessingWarnings = imageResult.warnings;
                console.log(`Image processing completed with warnings for story ${createdStory.id}:`, imageResult.warnings);
              } else {
                console.log(`Image processing completed successfully for story ${createdStory.id}. Downloaded ${imageResult.downloadedImages?.length || 0} images.`);
              }

              // Update story content with processed images
              if (imageResult.processedContent && imageResult.processedContent !== scrapedStory.content) {
                const updateUrl = `http://backend:8080/api/stories/${createdStory.id}`;
                const updateResponse = await fetch(updateUrl, {
                  method: 'PUT',
                  headers: {
                    'Authorization': authorization,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify({ contentHtml: imageResult.processedContent }),
                });

                if (!updateResponse.ok) {
                  console.warn(`Failed to update story content after image processing for ${createdStory.id}`);
                  imageProcessingWarnings.push('Failed to update story content with processed images');
                }
              }
            } else {
              console.warn(`Image processing failed for story ${createdStory.id}:`, imageProcessResponse.status);
              imageProcessingWarnings.push('Image processing failed');
            }
          } catch (error) {
            console.error(`Error processing images for story ${createdStory.id}:`, error);
            imageProcessingWarnings.push(`Image processing error: ${error instanceof Error ? error.message : 'Unknown error'}`);
          }
        }

        results.push({
          url: trimmedUrl,
          status: 'imported',
          title: scrapedStory.title,
          author: scrapedStory.author,
          storyId: createdStory.id
        });
        importedCount++;
        console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})${hasImages ? ` with ${imageProcessingWarnings.length > 0 ? 'warnings' : 'successful image processing'}` : ''}`);

        // Send progress update for successful import
        let progressMessage = `Imported "${scrapedStory.title}" by ${scrapedStory.author}`;
        if (hasImages) {
          progressMessage += imageProcessingWarnings.length > 0 ? ' (with image warnings)' : ' (with images)';
        }
        await sendProgressUpdate(sessionId, {
          type: 'progress',
          current: i + 1,
          total: urls.length,
          message: progressMessage,
          url: trimmedUrl,
          title: scrapedStory.title,
          author: scrapedStory.author,
          hasImages: hasImages,
          imageWarnings: imageProcessingWarnings
        });
      } catch (error) {
        console.error(`Failed to create story for ${trimmedUrl}:`, error);

        let errorMessage = 'Failed to create story';
        if (error instanceof Error) {
          errorMessage = error.message;
        }

        results.push({
          url: trimmedUrl,
          status: 'error',
          error: errorMessage,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        errorCount++;
      }
    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);

      let errorMessage = 'Unknown error';
      if (error instanceof Error) {
        errorMessage = error.message;
      }

      results.push({
        url: url,
        status: 'error',
        error: errorMessage
      });
      errorCount++;
    }
  }

  // Send completion notification
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`,
    results: results,
    summary: {
      total: urls.length,
      imported: importedCount,
      skipped: skippedCount,
      errors: errorCount
    }
  });

  console.log(`Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`);

  // Trigger Solr reindex if any stories were imported
  if (importedCount > 0) {
    try {
      console.log('Triggering Solr reindex after bulk import...');
      const reindexUrl = `http://backend:8080/api/admin/search/solr/reindex`;
      const reindexResponse = await fetch(reindexUrl, {
        method: 'POST',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });

      if (reindexResponse.ok) {
        const reindexResult = await reindexResponse.json();
        console.log('Solr reindex completed:', reindexResult);
      } else {
        console.warn('Solr reindex failed:', reindexResponse.status);
      }
    } catch (error) {
      console.warn('Failed to trigger Solr reindex:', error);
      // Don't fail the whole request if reindex fails
    }
  }
}

// Background processing function
async function processBulkImport(
  urls: string[],
  combineIntoOne: boolean,
  sessionId: string,
  authorization: string
) {
  try {
    // Dynamic imports to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');
    const scraper = new StoryScraper();

    console.log(`Starting bulk scraping for ${urls.length} URLs${combineIntoOne ? ' (combine mode)' : ''}`);
    console.log(`Session ID: ${sessionId}`);

    // Quick test to verify backend connectivity
    try {
      console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
      const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
        method: 'GET',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      console.log(`Backend test response status: ${testResponse.status}`);
    } catch (error) {
      console.error(`Backend connectivity test failed:`, error);
    }

    // Handle combined mode
    if (combineIntoOne) {
      await processCombinedMode(urls, sessionId, authorization, scraper);
    } else {
      // Normal individual processing mode
      await processIndividualMode(urls, sessionId, authorization, scraper);
    }
  } catch (error) {
    console.error('Background bulk import error:', error);
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: 0,
      total: urls.length,
      message: 'Bulk import failed due to an error',
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
}
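// Design note: POST below starts `processBulkImport` without awaiting it, so
// the HTTP response returns immediately while scraping continues in the
// background of the same invocation. This is why `maxDuration` is raised to
// 900 seconds at the top of the file; all user-visible results flow through
// `sendProgressUpdate` rather than the HTTP response body.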
export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls, combineIntoOne = false, sessionId } = body as BulkImportRequest;

    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }

    if (urls.length > 200) {
      return NextResponse.json(
        { error: 'Maximum 200 URLs allowed per bulk import' },
        { status: 400 }
      );
    }

    if (!sessionId) {
      return NextResponse.json(
        { error: 'Session ID is required for progress tracking' },
        { status: 400 }
      );
    }

    // Start the background processing (intentionally not awaited)
    processBulkImport(urls, combineIntoOne, sessionId, authorization).catch(error => {
      console.error('Failed to start background processing:', error);
    });

    // Return immediately with session info
    return NextResponse.json({
      message: 'Bulk import started',
      sessionId: sessionId,
      totalUrls: urls.length,
      combineMode: combineIntoOne
    });
  } catch (error) {
    console.error('Bulk import initialization error:', error);

    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed to start: ${error.message}` },
        { status: 500 }
      );
    }

    return NextResponse.json(
      { error: 'Bulk import failed to start due to an unknown error' },
      { status: 500 }
    );
  }
}
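/*
 * Example client usage (sketch). This assumes progress updates are exposed to
 * the browser via a companion endpoint — e.g. an SSE route keyed by sessionId
 * — which is not part of this file; the route paths and event shape below are
 * illustrative assumptions, not a confirmed API.
 *
 *   const sessionId = crypto.randomUUID();
 *
 *   // Hypothetical progress subscription (endpoint path is an assumption)
 *   const events = new EventSource(`/api/import/progress/${sessionId}`);
 *   events.onmessage = (e) => {
 *     const update = JSON.parse(e.data); // shape: see ProgressUpdate above
 *     console.log(`${update.current}/${update.total}: ${update.message}`);
 *     if (update.type === 'completed' || update.type === 'error') events.close();
 *   };
 *
 *   // Kick off the bulk import; the response returns immediately
 *   await fetch('/api/import/bulk', { // path assumed from this file's location
 *     method: 'POST',
 *     headers: {
 *       'Authorization': `Bearer ${token}`,
 *       'Content-Type': 'application/json',
 *     },
 *     body: JSON.stringify({ urls, combineIntoOne: false, sessionId }),
 *   });
 */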