import { NextRequest, NextResponse } from 'next/server';

// Import progress tracking helper
import { sendProgressUpdate } from '../../../lib/progress';

// Configure route timeout for long-running scraping operations
export const maxDuration = 900; // 15 minutes (900 seconds)

interface BulkImportRequest {
  urls: string[];
  combineIntoOne?: boolean;
  sessionId?: string; // For progress tracking
}
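
// Illustrative request body (hypothetical values, not real URLs):
//
//   {
//     "urls": [
//       "https://example.com/stories/123",
//       "https://example.com/stories/456"
//     ],
//     "combineIntoOne": false,
//     "sessionId": "bulk-import-1700000000000"
//   }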

interface ImportResult {
  url: string;
  status: 'imported' | 'skipped' | 'error';
  reason?: string;
  title?: string;
  author?: string;
  error?: string;
  storyId?: string;
}

interface BulkImportResponse {
  results: ImportResult[];
  summary: {
    total: number;
    imported: number;
    skipped: number;
    errors: number;
  };
  combinedStory?: {
    title: string;
    author: string;
    content: string;
    summary?: string;
    sourceUrl: string;
    tags?: string[];
  };
}
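
// Note: the POST handler below returns as soon as background processing starts,
// so data shaped like BulkImportResponse (results, summary, combinedStory)
// reaches the client through the 'completed' progress event sent via
// sendProgressUpdate rather than in the HTTP response body.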

// Background processing function for combined mode
async function processCombinedMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let errorCount = 0;

  const combinedContent: string[] = [];
  let baseTitle = '';
  let baseAuthor = '';
  let baseSummary = '';
  let baseSourceUrl = '';
  const combinedTags = new Set<string>();
  let totalWordCount = 0;

  // Send initial progress update
  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to scrape ${urls.length} URLs for combining...`,
    totalWordCount: 0
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Scraping URL ${i + 1}/${urls.length} for combine: ${url}`);

    // Send progress update
    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Scraping URL ${i + 1} of ${urls.length}...`,
      url: url,
      totalWordCount
    });

    try {
      const trimmedUrl = url.trim();
      if (!trimmedUrl) {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Empty URL in combined mode'
        });
        errorCount++;
        continue;
      }

      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Check if we got content - this is required for combined mode
      if (!scrapedStory.content || scrapedStory.content.trim() === '') {
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: 'No content found - required for combined mode'
        });
        errorCount++;
        continue;
      }

      // Use first URL for base metadata (title can be empty for combined mode)
      if (i === 0) {
        baseTitle = scrapedStory.title || 'Combined Story';
        baseAuthor = scrapedStory.author || 'Unknown Author';
        baseSummary = scrapedStory.summary || '';
        baseSourceUrl = trimmedUrl;
      }

      // Add content with URL separator
      combinedContent.push(`<!-- Content from: ${trimmedUrl} -->`);
      if (scrapedStory.title && i > 0) {
        combinedContent.push(`<h2>${scrapedStory.title}</h2>`);
      }
      combinedContent.push(scrapedStory.content);
      combinedContent.push('<hr/>'); // Visual separator between parts
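
      // Shape of the combined markup built above for two hypothetical parts
      // (the <h2> heading is only inserted for parts after the first):
      //
      //   <!-- Content from: https://example.com/part-1 -->
      //   ...part 1 content...
      //   <hr/>
      //   <!-- Content from: https://example.com/part-2 -->
      //   <h2>Part 2 Title</h2>
      //   ...part 2 content...
      //   <hr/>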

      // Calculate word count for this story
      const textContent = scrapedStory.content.replace(/<[^>]*>/g, ''); // Strip HTML
      const wordCount = textContent.split(/\s+/).filter((word: string) => word.length > 0).length;
      totalWordCount += wordCount;
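      // For example, '<p>Hello <b>brave</b> new world</p>' strips to
      // 'Hello brave new world', which counts as 4 words here.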

      // Collect tags from all stories
      if (scrapedStory.tags) {
        scrapedStory.tags.forEach((tag: string) => combinedTags.add(tag));
      }

      results.push({
        url: trimmedUrl,
        status: 'imported',
        title: scrapedStory.title,
        author: scrapedStory.author
      });
      importedCount++;

      // Send progress update with word count
      await sendProgressUpdate(sessionId, {
        type: 'progress',
        current: i + 1,
        total: urls.length,
        message: `Scraped "${scrapedStory.title}" (${wordCount.toLocaleString()} words)`,
        url: trimmedUrl,
        title: scrapedStory.title,
        author: scrapedStory.author,
        wordCount: wordCount,
        totalWordCount: totalWordCount
      });

    } catch (error) {
      console.error(`Error processing URL ${url} in combined mode:`, error);
      results.push({
        url: url,
        status: 'error',
        error: error instanceof Error ? error.message : 'Unknown error'
      });
      errorCount++;
    }
  }

  // If we have any errors, fail the entire combined operation
  if (errorCount > 0) {
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: urls.length,
      total: urls.length,
      message: 'Combined mode failed: some URLs could not be processed',
      error: `${errorCount} URLs failed to process`
    });
    return;
  }

  // Check content size to prevent response size issues
  const combinedContentString = combinedContent.join('\n');
  const contentSizeInMB = new Blob([combinedContentString]).size / (1024 * 1024);

  console.log(`Combined content size: ${contentSizeInMB.toFixed(2)} MB`);
  console.log(`Combined content character length: ${combinedContentString.length}`);
  console.log(`Combined content parts count: ${combinedContent.length}`);

  // Handle content truncation if needed
  let finalContent = contentSizeInMB > 10 ?
    combinedContentString.substring(0, Math.floor(combinedContentString.length * (10 / contentSizeInMB))) + '\n\n<!-- Content truncated due to size limit -->' :
    combinedContentString;

  let finalSummary = contentSizeInMB > 10 ? baseSummary + ' (Content truncated due to size limit)' : baseSummary;
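  // Example of the proportional truncation above: a 15 MB combined string keeps
  // only the first 10/15 ≈ 66.7% of its characters, landing near the 10 MB cap
  // before the truncation notice is appended.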

  // Check if combined content has images and mark for processing
  const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(finalContent);
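  // This only detects that some <img> tag with a src attribute is present; the
  // actual image download/rewrite is left to the backend's
  // process-content-images endpoint once the combined story has been created.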
  if (hasImages) {
    finalSummary += ' (Contains embedded images - will be processed after story creation)';
    console.log(`Combined story contains embedded images - will need processing after creation`);
  }

  // Return the combined story data via progress update
  const combinedStory = {
    title: baseTitle,
    author: baseAuthor,
    content: finalContent,
    summary: finalSummary,
    sourceUrl: baseSourceUrl,
    tags: Array.from(combinedTags),
    hasImages: hasImages
  };

  // Send completion notification for combine mode
  let completionMessage = `Combined scraping completed: ${totalWordCount.toLocaleString()} words from ${importedCount} stories`;
  if (hasImages) {
    completionMessage += ` (embedded images will be processed when story is created)`;
  }

  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: completionMessage,
    totalWordCount: totalWordCount,
    combinedStory: combinedStory
  });

  console.log(`Combined scraping completed: ${importedCount} URLs combined into one story`);
}

// Background processing function for individual mode
async function processIndividualMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let skippedCount = 0;
  let errorCount = 0;

  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to import ${urls.length} URLs individually...`
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Processing URL ${i + 1}/${urls.length}: ${url}`);

    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Processing URL ${i + 1} of ${urls.length}...`,
      url: url
    });

    try {
      // Validate URL format
      if (!url || typeof url !== 'string' || url.trim() === '') {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Invalid URL format'
        });
        errorCount++;
        continue;
      }

      const trimmedUrl = url.trim();

      // Scrape the story
      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Validate required fields
      if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
        const missingFields = [];
        if (!scrapedStory.title) missingFields.push('title');
        if (!scrapedStory.author) missingFields.push('author');
        if (!scrapedStory.content) missingFields.push('content');

        results.push({
          url: trimmedUrl,
          status: 'skipped',
          reason: `Missing required fields: ${missingFields.join(', ')}`,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        skippedCount++;
        continue;
      }

      // Check for duplicates using query parameters
      try {
        const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
        const params = new URLSearchParams({
          title: scrapedStory.title,
          authorName: scrapedStory.author
        });

        const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });

        if (duplicateCheckResponse.ok) {
          const duplicateResult = await duplicateCheckResponse.json();
          if (duplicateResult.hasDuplicates) {
            results.push({
              url: trimmedUrl,
              status: 'skipped',
              reason: `Duplicate story found (${duplicateResult.count} existing)`,
              title: scrapedStory.title,
              author: scrapedStory.author
            });
            skippedCount++;
            continue;
          }
        }
      } catch (error) {
        console.warn('Duplicate check failed:', error);
        // Continue with import if duplicate check fails
      }
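
      // For illustration (hypothetical values): a story titled "My Story" by
      // "Jane Doe" produces a request like
      //   GET http://backend:8080/api/stories/check-duplicate?title=My+Story&authorName=Jane+Doe
      // and the import simply continues when hasDuplicates is false or the
      // check itself errors out.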

      // Create the story
      try {
        const storyData = {
          title: scrapedStory.title,
          summary: scrapedStory.summary || undefined,
          contentHtml: scrapedStory.content,
          sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
          authorName: scrapedStory.author,
          tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
        };
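        // Sketch of a typical payload (hypothetical values) sent to the
        // backend's POST /api/stories endpoint below:
        //   { title: 'My Story', contentHtml: '<p>...</p>', sourceUrl: trimmedUrl,
        //     authorName: 'Jane Doe', tagNames: ['fantasy'] }
        // Optional fields left as undefined are dropped by JSON.stringify.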

        const createUrl = `http://backend:8080/api/stories`;
        const createResponse = await fetch(createUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify(storyData),
        });

        if (!createResponse.ok) {
          const errorData = await createResponse.json();
          throw new Error(errorData.message || 'Failed to create story');
        }

        const createdStory = await createResponse.json();

        // Process embedded images if content contains images
        let imageProcessingWarnings: string[] = [];
        const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(scrapedStory.content);

        if (hasImages) {
          try {
            console.log(`Processing embedded images for story: ${createdStory.id}`);
            const imageProcessUrl = `http://backend:8080/api/stories/${createdStory.id}/process-content-images`;
            const imageProcessResponse = await fetch(imageProcessUrl, {
              method: 'POST',
              headers: {
                'Authorization': authorization,
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({ htmlContent: scrapedStory.content }),
            });

            if (imageProcessResponse.ok) {
              const imageResult = await imageProcessResponse.json();
              if (imageResult.hasWarnings && imageResult.warnings) {
                imageProcessingWarnings = imageResult.warnings;
                console.log(`Image processing completed with warnings for story ${createdStory.id}:`, imageResult.warnings);
              } else {
                console.log(`Image processing completed successfully for story ${createdStory.id}. Downloaded ${imageResult.downloadedImages?.length || 0} images.`);
              }

              // Update story content with processed images
              if (imageResult.processedContent && imageResult.processedContent !== scrapedStory.content) {
                const updateUrl = `http://backend:8080/api/stories/${createdStory.id}`;
                const updateResponse = await fetch(updateUrl, {
                  method: 'PUT',
                  headers: {
                    'Authorization': authorization,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify({
                    contentHtml: imageResult.processedContent
                  }),
                });

                if (!updateResponse.ok) {
                  console.warn(`Failed to update story content after image processing for ${createdStory.id}`);
                  imageProcessingWarnings.push('Failed to update story content with processed images');
                }
              }
            } else {
              console.warn(`Image processing failed for story ${createdStory.id}:`, imageProcessResponse.status);
              imageProcessingWarnings.push('Image processing failed');
            }
          } catch (error) {
            console.error(`Error processing images for story ${createdStory.id}:`, error);
            imageProcessingWarnings.push(`Image processing error: ${error instanceof Error ? error.message : 'Unknown error'}`);
          }
        }

        results.push({
          url: trimmedUrl,
          status: 'imported',
          title: scrapedStory.title,
          author: scrapedStory.author,
          storyId: createdStory.id
        });
        importedCount++;

        console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})${hasImages ? ` with ${imageProcessingWarnings.length > 0 ? 'warnings' : 'successful image processing'}` : ''}`);

        // Send progress update for successful import
        let progressMessage = `Imported "${scrapedStory.title}" by ${scrapedStory.author}`;
        if (hasImages) {
          progressMessage += imageProcessingWarnings.length > 0 ? ' (with image warnings)' : ' (with images)';
        }

        await sendProgressUpdate(sessionId, {
          type: 'progress',
          current: i + 1,
          total: urls.length,
          message: progressMessage,
          url: trimmedUrl,
          title: scrapedStory.title,
          author: scrapedStory.author,
          hasImages: hasImages,
          imageWarnings: imageProcessingWarnings
        });

      } catch (error) {
        console.error(`Failed to create story for ${trimmedUrl}:`, error);

        let errorMessage = 'Failed to create story';
        if (error instanceof Error) {
          errorMessage = error.message;
        }

        results.push({
          url: trimmedUrl,
          status: 'error',
          error: errorMessage,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        errorCount++;
      }

    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);

      let errorMessage = 'Unknown error';
      if (error instanceof Error) {
        errorMessage = error.message;
      }

      results.push({
        url: url,
        status: 'error',
        error: errorMessage
      });
      errorCount++;
    }
  }

  // Send completion notification
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`,
    results: results,
    summary: {
      total: urls.length,
      imported: importedCount,
      skipped: skippedCount,
      errors: errorCount
    }
  });

  console.log(`Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`);

  // Trigger Solr reindex if any stories were imported
  if (importedCount > 0) {
    try {
      console.log('Triggering Solr reindex after bulk import...');
      const reindexUrl = `http://backend:8080/api/admin/search/solr/reindex`;
      const reindexResponse = await fetch(reindexUrl, {
        method: 'POST',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });

      if (reindexResponse.ok) {
        const reindexResult = await reindexResponse.json();
        console.log('Solr reindex completed:', reindexResult);
      } else {
        console.warn('Solr reindex failed:', reindexResponse.status);
      }
    } catch (error) {
      console.warn('Failed to trigger Solr reindex:', error);
      // Don't fail the whole request if reindex fails
    }
  }
}

// Background processing function
async function processBulkImport(
  urls: string[],
  combineIntoOne: boolean,
  sessionId: string,
  authorization: string
) {
  try {
    // Dynamic imports to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');

    const scraper = new StoryScraper();

    console.log(`Starting bulk scraping for ${urls.length} URLs${combineIntoOne ? ' (combine mode)' : ''}`);
    console.log(`Session ID: ${sessionId}`);

    // Quick test to verify backend connectivity
    try {
      console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
      const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
        method: 'GET',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      console.log(`Backend test response status: ${testResponse.status}`);
    } catch (error) {
      console.error(`Backend connectivity test failed:`, error);
    }

    // Handle combined mode
    if (combineIntoOne) {
      await processCombinedMode(urls, sessionId, authorization, scraper);
    } else {
      // Normal individual processing mode
      await processIndividualMode(urls, sessionId, authorization, scraper);
    }

  } catch (error) {
    console.error('Background bulk import error:', error);
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: 0,
      total: urls.length,
      message: 'Bulk import failed due to an error',
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
}

export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls, combineIntoOne = false, sessionId } = body as BulkImportRequest;

    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }

    if (urls.length > 200) {
      return NextResponse.json(
        { error: 'Maximum 200 URLs allowed per bulk import' },
        { status: 400 }
      );
    }

    if (!sessionId) {
      return NextResponse.json(
        { error: 'Session ID is required for progress tracking' },
        { status: 400 }
      );
    }

    // Start the background processing
    processBulkImport(urls, combineIntoOne, sessionId, authorization).catch(error => {
      console.error('Failed to start background processing:', error);
    });
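    // processBulkImport is intentionally not awaited: the route responds right
    // away and the client follows progress via sessionId, so a rejection here
    // is only logged rather than surfaced in the HTTP response.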

    // Return immediately with session info
    return NextResponse.json({
      message: 'Bulk import started',
      sessionId: sessionId,
      totalUrls: urls.length,
      combineMode: combineIntoOne
    });

  } catch (error) {
    console.error('Bulk import initialization error:', error);

    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed to start: ${error.message}` },
        { status: 500 }
      );
    }

    return NextResponse.json(
      { error: 'Bulk import failed to start due to an unknown error' },
      { status: 500 }
    );
  }
}
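
// Illustrative client-side call (a sketch only; the endpoint path, sessionId
// scheme, and token handling are assumptions, not defined in this file):
//
//   const sessionId = `bulk-${Date.now()}`;
//   const res = await fetch('/api/stories/bulk-import', {
//     method: 'POST',
//     headers: {
//       'Authorization': `Bearer ${token}`,
//       'Content-Type': 'application/json',
//     },
//     body: JSON.stringify({ urls, combineIntoOne: false, sessionId }),
//   });
//   // => { message: 'Bulk import started', sessionId, totalUrls: urls.length, combineMode: false }
//
// Progress updates and the final results/summary (or combinedStory) then arrive
// through the progress channel keyed by sessionId.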