Various improvements & Epub support
frontend/src/app/scrape/bulk/progress/route.ts (new file, 93 lines)
@@ -0,0 +1,93 @@
import { NextRequest } from 'next/server';

// Configure route timeout for long-running progress streams
export const maxDuration = 900; // 15 minutes (900 seconds)

interface ProgressUpdate {
  type: 'progress' | 'completed' | 'error';
  current: number;
  total: number;
  message: string;
  url?: string;
  title?: string;
  author?: string;
  wordCount?: number;
  totalWordCount?: number;
  error?: string;
  combinedStory?: any;
  results?: any[];
  summary?: any;
}

// Global progress storage (in production, use Redis or database)
const progressStore = new Map<string, ProgressUpdate[]>();

export async function GET(request: NextRequest) {
  const searchParams = request.nextUrl.searchParams;
  const sessionId = searchParams.get('sessionId');

  if (!sessionId) {
    return new Response('Session ID required', { status: 400 });
  }

  // Set up Server-Sent Events
  const stream = new ReadableStream({
    start(controller) {
      const encoder = new TextEncoder();

      // Send initial connection message
      const data = `data: ${JSON.stringify({ type: 'connected', sessionId })}\n\n`;
      controller.enqueue(encoder.encode(data));

      // Check for progress updates every 500ms
      const interval = setInterval(() => {
        const updates = progressStore.get(sessionId);
        if (updates && updates.length > 0) {
          // Send all pending updates
          updates.forEach(update => {
            const data = `data: ${JSON.stringify(update)}\n\n`;
            controller.enqueue(encoder.encode(data));
          });

          // Clear sent updates
          progressStore.delete(sessionId);

          // If this was a completion or error, close the stream
          const lastUpdate = updates[updates.length - 1];
          if (lastUpdate.type === 'completed' || lastUpdate.type === 'error') {
            clearInterval(interval);
            controller.close();
          }
        }
      }, 500);

      // Cleanup after timeout
      setTimeout(() => {
        clearInterval(interval);
        progressStore.delete(sessionId);
        try {
          controller.close();
        } catch {
          // The stream may already have been closed by a completed/error update
        }
      }, 900000); // 15 minutes
    }
  });

  return new Response(stream, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
      'Connection': 'keep-alive',
      'Access-Control-Allow-Origin': '*',
      'Access-Control-Allow-Headers': 'Cache-Control',
    },
  });
}

// Helper function for other routes to send progress updates
export function sendProgressUpdate(sessionId: string, update: ProgressUpdate) {
  if (!progressStore.has(sessionId)) {
    progressStore.set(sessionId, []);
  }
  progressStore.get(sessionId)!.push(update);
}

// Export the progress store for other modules to use
export { progressStore };
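
The Map above lives in the memory of a single Node process, and the inline comment already flags Redis or a database for production. A rough sketch of what a Redis-backed replacement could look like, assuming the ioredis client and a made-up progress:<sessionId> key scheme; this is an editor's sketch, not part of the commit:

// Hypothetical Redis-backed progress store (sketch, not from this commit).
// Assumes the ioredis package; each session's updates live in a Redis list.
import Redis from 'ioredis';

const redis = new Redis(process.env.REDIS_URL ?? 'redis://localhost:6379');

export async function pushProgressUpdate(sessionId: string, update: ProgressUpdate) {
  const key = `progress:${sessionId}`;
  await redis.rpush(key, JSON.stringify(update)); // append, preserving order
  await redis.expire(key, 900);                   // mirror the 15-minute route timeout
}

export async function drainProgressUpdates(sessionId: string): Promise<ProgressUpdate[]> {
  const key = `progress:${sessionId}`;
  // Read then delete; a MULTI/EXEC transaction would make this atomic under concurrency
  const raw = await redis.lrange(key, 0, -1);
  if (raw.length > 0) await redis.del(key);
  return raw.map(entry => JSON.parse(entry));
}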

frontend/src/app/scrape/bulk/route.ts (modified)
@@ -1,7 +1,23 @@
import { NextRequest, NextResponse } from 'next/server';

// Configure route timeout for long-running scraping operations
export const maxDuration = 900; // 15 minutes (900 seconds)

// Import progress tracking helper
async function sendProgressUpdate(sessionId: string, update: any) {
  try {
    // Dynamic import to avoid circular dependency
    const { sendProgressUpdate: sendUpdate } = await import('./progress/route');
    sendUpdate(sessionId, update);
  } catch (error) {
    console.warn('Failed to send progress update:', error);
  }
}

interface BulkImportRequest {
  urls: string[];
  combineIntoOne?: boolean;
  sessionId?: string; // For progress tracking
}

interface ImportResult {
@@ -22,52 +38,430 @@ interface BulkImportResponse {
    skipped: number;
    errors: number;
  };
  combinedStory?: {
    title: string;
    author: string;
    content: string;
    summary?: string;
    sourceUrl: string;
    tags?: string[];
  };
}

// Background processing function for combined mode
async function processCombinedMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let errorCount = 0;

  const combinedContent: string[] = [];
  let baseTitle = '';
  let baseAuthor = '';
  let baseSummary = '';
  let baseSourceUrl = '';
  const combinedTags = new Set<string>();
  let totalWordCount = 0;

  // Send initial progress update
  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to scrape ${urls.length} URLs for combining...`,
    totalWordCount: 0
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Scraping URL ${i + 1}/${urls.length} for combine: ${url}`);

    // Send progress update
    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Scraping URL ${i + 1} of ${urls.length}...`,
      url: url,
      totalWordCount
    });

    try {
      const trimmedUrl = url.trim();
      if (!trimmedUrl) {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Empty URL in combined mode'
        });
        errorCount++;
        continue;
      }

      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Check if we got content - this is required for combined mode
      if (!scrapedStory.content || scrapedStory.content.trim() === '') {
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: 'No content found - required for combined mode'
        });
        errorCount++;
        continue;
      }

      // Use first URL for base metadata (title can be empty for combined mode)
      if (i === 0) {
        baseTitle = scrapedStory.title || 'Combined Story';
        baseAuthor = scrapedStory.author || 'Unknown Author';
        baseSummary = scrapedStory.summary || '';
        baseSourceUrl = trimmedUrl;
      }

      // Add content with URL separator
      combinedContent.push(`<!-- Content from: ${trimmedUrl} -->`);
      if (scrapedStory.title && i > 0) {
        combinedContent.push(`<h2>${scrapedStory.title}</h2>`);
      }
      combinedContent.push(scrapedStory.content);
      combinedContent.push('<hr/>'); // Visual separator between parts

      // Calculate word count for this story
      const textContent = scrapedStory.content.replace(/<[^>]*>/g, ''); // Strip HTML
      const wordCount = textContent.split(/\s+/).filter((word: string) => word.length > 0).length;
      totalWordCount += wordCount;

      // Collect tags from all stories
      if (scrapedStory.tags) {
        scrapedStory.tags.forEach((tag: string) => combinedTags.add(tag));
      }

      results.push({
        url: trimmedUrl,
        status: 'imported',
        title: scrapedStory.title,
        author: scrapedStory.author
      });
      importedCount++;

      // Send progress update with word count
      await sendProgressUpdate(sessionId, {
        type: 'progress',
        current: i + 1,
        total: urls.length,
        message: `Scraped "${scrapedStory.title}" (${wordCount.toLocaleString()} words)`,
        url: trimmedUrl,
        title: scrapedStory.title,
        author: scrapedStory.author,
        wordCount: wordCount,
        totalWordCount: totalWordCount
      });

    } catch (error) {
      console.error(`Error processing URL ${url} in combined mode:`, error);
      results.push({
        url: url,
        status: 'error',
        error: error instanceof Error ? error.message : 'Unknown error'
      });
      errorCount++;
    }
  }

  // If we have any errors, fail the entire combined operation
  if (errorCount > 0) {
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: urls.length,
      total: urls.length,
      message: 'Combined mode failed: some URLs could not be processed',
      error: `${errorCount} URLs failed to process`
    });
    return;
  }

  // Check content size to prevent response size issues
  const combinedContentString = combinedContent.join('\n');
  const contentSizeInMB = new Blob([combinedContentString]).size / (1024 * 1024);

  console.log(`Combined content size: ${contentSizeInMB.toFixed(2)} MB`);
  console.log(`Combined content character length: ${combinedContentString.length}`);
  console.log(`Combined content parts count: ${combinedContent.length}`);

  // Return the combined story data via progress update
  const combinedStory = {
    title: baseTitle,
    author: baseAuthor,
    content: contentSizeInMB > 10 ?
      combinedContentString.substring(0, Math.floor(combinedContentString.length * (10 / contentSizeInMB))) + '\n\n<!-- Content truncated due to size limit -->' :
      combinedContentString,
    summary: contentSizeInMB > 10 ? baseSummary + ' (Content truncated due to size limit)' : baseSummary,
    sourceUrl: baseSourceUrl,
    tags: Array.from(combinedTags)
  };

  // Send completion notification for combine mode
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Combined scraping completed: ${totalWordCount.toLocaleString()} words from ${importedCount} stories`,
    totalWordCount: totalWordCount,
    combinedStory: combinedStory
  });

  console.log(`Combined scraping completed: ${importedCount} URLs combined into one story`);
}
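
The loop above strips tags with a regex before counting words, and the size check near the end of processCombinedMode keeps a proportional character prefix when the combined HTML exceeds 10 MB. For reference, the same arithmetic as standalone helpers; an editor's sketch, not code from this commit:

// Sketch of the word-count and size-cap arithmetic used by processCombinedMode.
function countWords(html: string): number {
  const text = html.replace(/<[^>]*>/g, ''); // strip HTML tags
  return text.split(/\s+/).filter(word => word.length > 0).length;
}

function truncateToMegabytes(content: string, limitMB = 10): string {
  const sizeInMB = new Blob([content]).size / (1024 * 1024);
  if (sizeInMB <= limitMB) return content;
  // Keep limitMB/sizeInMB of the characters. The ratio is computed on bytes
  // but applied to characters, so it is approximate for multi-byte text.
  const keep = Math.floor(content.length * (limitMB / sizeInMB));
  return content.substring(0, keep) + '\n\n<!-- Content truncated due to size limit -->';
}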

// Background processing function for individual mode
async function processIndividualMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let skippedCount = 0;
  let errorCount = 0;

  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to import ${urls.length} URLs individually...`
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Processing URL ${i + 1}/${urls.length}: ${url}`);

    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Processing URL ${i + 1} of ${urls.length}...`,
      url: url
    });

    try {
      // Validate URL format
      if (!url || typeof url !== 'string' || url.trim() === '') {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Invalid URL format'
        });
        errorCount++;
        continue;
      }

      const trimmedUrl = url.trim();

      // Scrape the story
      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Validate required fields
      if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
        const missingFields = [];
        if (!scrapedStory.title) missingFields.push('title');
        if (!scrapedStory.author) missingFields.push('author');
        if (!scrapedStory.content) missingFields.push('content');

        results.push({
          url: trimmedUrl,
          status: 'skipped',
          reason: `Missing required fields: ${missingFields.join(', ')}`,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        skippedCount++;
        continue;
      }

      // Check for duplicates using query parameters
      try {
        const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
        const params = new URLSearchParams({
          title: scrapedStory.title,
          authorName: scrapedStory.author
        });

        const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });

        if (duplicateCheckResponse.ok) {
          const duplicateResult = await duplicateCheckResponse.json();
          if (duplicateResult.hasDuplicates) {
            results.push({
              url: trimmedUrl,
              status: 'skipped',
              reason: `Duplicate story found (${duplicateResult.count} existing)`,
              title: scrapedStory.title,
              author: scrapedStory.author
            });
            skippedCount++;
            continue;
          }
        }
      } catch (error) {
        console.warn('Duplicate check failed:', error);
        // Continue with import if duplicate check fails
      }

      // Create the story
      try {
        const storyData = {
          title: scrapedStory.title,
          summary: scrapedStory.summary || undefined,
          contentHtml: scrapedStory.content,
          sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
          authorName: scrapedStory.author,
          tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
        };

        const createUrl = `http://backend:8080/api/stories`;
        const createResponse = await fetch(createUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify(storyData),
        });

        if (!createResponse.ok) {
          const errorData = await createResponse.json();
          throw new Error(errorData.message || 'Failed to create story');
        }

        const createdStory = await createResponse.json();

        results.push({
          url: trimmedUrl,
          status: 'imported',
          title: scrapedStory.title,
          author: scrapedStory.author,
          storyId: createdStory.id
        });
        importedCount++;

        console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);

        // Send progress update for successful import
        await sendProgressUpdate(sessionId, {
          type: 'progress',
          current: i + 1,
          total: urls.length,
          message: `Imported "${scrapedStory.title}" by ${scrapedStory.author}`,
          url: trimmedUrl,
          title: scrapedStory.title,
          author: scrapedStory.author
        });

      } catch (error) {
        console.error(`Failed to create story for ${trimmedUrl}:`, error);

        let errorMessage = 'Failed to create story';
        if (error instanceof Error) {
          errorMessage = error.message;
        }

        results.push({
          url: trimmedUrl,
          status: 'error',
          error: errorMessage,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        errorCount++;
      }

    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);

      let errorMessage = 'Unknown error';
      if (error instanceof Error) {
        errorMessage = error.message;
      }

      results.push({
        url: url,
        status: 'error',
        error: errorMessage
      });
      errorCount++;
    }
  }

  // Send completion notification
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`,
    results: results,
    summary: {
      total: urls.length,
      imported: importedCount,
      skipped: skippedCount,
      errors: errorCount
    }
  });

  console.log(`Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`);

  // Trigger Typesense reindex if any stories were imported
  if (importedCount > 0) {
    try {
      console.log('Triggering Typesense reindex after bulk import...');
      const reindexUrl = `http://backend:8080/api/stories/reindex-typesense`;
      const reindexResponse = await fetch(reindexUrl, {
        method: 'POST',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });

      if (reindexResponse.ok) {
        const reindexResult = await reindexResponse.json();
        console.log('Typesense reindex completed:', reindexResult);
      } else {
        console.warn('Typesense reindex failed:', reindexResponse.status);
      }
    } catch (error) {
      console.warn('Failed to trigger Typesense reindex:', error);
      // Don't fail the whole request if reindex fails
    }
  }
}

// Background processing function
async function processBulkImport(
  urls: string[],
  combineIntoOne: boolean,
  sessionId: string,
  authorization: string
) {
  try {
    // Dynamic imports to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');

    const scraper = new StoryScraper();

    // For server-side API calls in Docker, use direct backend container URL
    // Client-side calls use NEXT_PUBLIC_API_URL through nginx, but server-side needs direct container access
    const serverSideApiBaseUrl = 'http://backend:8080/api';
    console.log(`DEBUG: serverSideApiBaseUrl variable is: ${serverSideApiBaseUrl}`);
    console.log(`Starting bulk scraping for ${urls.length} URLs${combineIntoOne ? ' (combine mode)' : ''}`);
    console.log(`Session ID: ${sessionId}`);

    // Quick test to verify backend connectivity
    try {
@@ -84,208 +478,86 @@ export async function POST(request: NextRequest) {
      console.error(`Backend connectivity test failed:`, error);
    }

    // Handle combined mode
    if (combineIntoOne) {
      await processCombinedMode(urls, sessionId, authorization, scraper);
    } else {
      // Normal individual processing mode
      await processIndividualMode(urls, sessionId, authorization, scraper);
    }

  } catch (error) {
    console.error('Background bulk import error:', error);
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: 0,
      total: urls.length,
      message: 'Bulk import failed due to an error',
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
}

export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls, combineIntoOne = false, sessionId } = body as BulkImportRequest;

    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }

    if (urls.length > 200) {
      return NextResponse.json(
        { error: 'Maximum 200 URLs allowed per bulk import' },
        { status: 400 }
      );
    }

    if (!sessionId) {
      return NextResponse.json(
        { error: 'Session ID is required for progress tracking' },
        { status: 400 }
      );
    }

    // Start the background processing
    processBulkImport(urls, combineIntoOne, sessionId, authorization).catch(error => {
      console.error('Failed to start background processing:', error);
    });

    // Return immediately with session info
    return NextResponse.json({
      message: 'Bulk import started',
      sessionId: sessionId,
      totalUrls: urls.length,
      combineMode: combineIntoOne
    });

  } catch (error) {
    console.error('Bulk import initialization error:', error);

    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed to start: ${error.message}` },
        { status: 500 }
      );
    }

    return NextResponse.json(
      { error: 'Bulk import failed to start due to an unknown error' },
      { status: 500 }
    );
  }
}
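
Taken together, the routes give a start-then-subscribe flow: the client invents a session id, opens the SSE stream, then fires the POST. A hypothetical browser-side sketch (route paths inferred from the file locations under src/app; the auth scheme and wiring are illustrative, not part of this commit):

// Hypothetical client-side usage of the two routes above.
async function startBulkImport(urls: string[], token: string) {
  // Any unique string works as a session id
  const sessionId = crypto.randomUUID();

  // Subscribe before starting so early updates are not missed
  const events = new EventSource(`/scrape/bulk/progress?sessionId=${sessionId}`);
  events.onmessage = (event) => {
    const update = JSON.parse(event.data);
    if (update.type === 'connected') return; // initial handshake message
    console.log(`${update.current}/${update.total}: ${update.message}`);
    if (update.type === 'completed' || update.type === 'error') {
      events.close();
    }
  };

  // Kick off the background import; the route responds immediately
  const response = await fetch('/scrape/bulk', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${token}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({ urls, combineIntoOne: false, sessionId }),
  });
  return response.json(); // { message, sessionId, totalUrls, combineMode }
}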