scraping and improvements
frontend/src/app/scrape/bulk/route.ts (new file, 292 lines added)
@@ -0,0 +1,292 @@
import { NextRequest, NextResponse } from 'next/server';

interface BulkImportRequest {
  urls: string[];
}

interface ImportResult {
  url: string;
  status: 'imported' | 'skipped' | 'error';
  reason?: string;
  title?: string;
  author?: string;
  error?: string;
  storyId?: string;
}

interface BulkImportResponse {
  results: ImportResult[];
  summary: {
    total: number;
    imported: number;
    skipped: number;
    errors: number;
  };
}
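
// Illustrative request/response for this route (shapes follow the interfaces
// above; the URLs, names, and IDs are placeholder values, not real data):
//
//   POST /scrape/bulk
//   { "urls": ["https://example.com/story/1", "https://example.com/story/2"] }
//
//   200 response:
//   {
//     "results": [
//       { "url": "https://example.com/story/1", "status": "imported",
//         "title": "A Story", "author": "A. Writer", "storyId": "123" },
//       { "url": "https://example.com/story/2", "status": "skipped",
//         "reason": "Duplicate story found (1 existing)" }
//     ],
//     "summary": { "total": 2, "imported": 1, "skipped": 1, "errors": 0 }
//   }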

export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls } = body as BulkImportRequest;

    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }

    if (urls.length > 50) {
      return NextResponse.json(
        { error: 'Maximum 50 URLs allowed per bulk import' },
        { status: 400 }
      );
    }
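
    // Requests rejected by the checks above (illustrative examples):
    //   missing Authorization header     -> 401
    //   {}  or  { "urls": [] }           -> 400 (missing or empty array)
    //   { "urls": [ /* 51 items */ ] }   -> 400 (over the 50-URL cap)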

    // Dynamic import so the scraper is never bundled client-side
    const { StoryScraper } = await import('@/lib/scraper/scraper');

    const scraper = new StoryScraper();
    const results: ImportResult[] = [];
    let importedCount = 0;
    let skippedCount = 0;
    let errorCount = 0;

    console.log(`Starting bulk scraping for ${urls.length} URLs`);

    // Server-side calls inside Docker must hit the backend container directly;
    // only client-side calls go through nginx via NEXT_PUBLIC_API_URL.
    const serverSideApiBaseUrl = 'http://backend:8080/api';

    // Quick connectivity check against the backend before processing URLs
    try {
      const testResponse = await fetch(
        `${serverSideApiBaseUrl}/stories/check-duplicate?title=test&authorName=test`,
        {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        }
      );
      console.log(`Backend test response status: ${testResponse.status}`);
    } catch (error) {
      console.error('Backend connectivity test failed:', error);
    }
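
    // Per-URL pipeline (each iteration below): validate the URL string,
    // scrape it, skip on missing required fields or an existing duplicate,
    // otherwise create the story via the backend and record the outcome.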
    for (const url of urls) {
      console.log(`Processing URL: ${url}`);

      try {
        // Validate URL format
        if (!url || typeof url !== 'string' || url.trim() === '') {
          results.push({
            url: url || 'Empty URL',
            status: 'error',
            error: 'Invalid URL format'
          });
          errorCount++;
          continue;
        }

        const trimmedUrl = url.trim();

        // Scrape the story
        const scrapedStory = await scraper.scrapeStory(trimmedUrl);

        // Validate required fields
        if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
          const missingFields: string[] = [];
          if (!scrapedStory.title) missingFields.push('title');
          if (!scrapedStory.author) missingFields.push('author');
          if (!scrapedStory.content) missingFields.push('content');

          results.push({
            url: trimmedUrl,
            status: 'skipped',
            reason: `Missing required fields: ${missingFields.join(', ')}`,
            title: scrapedStory.title,
            author: scrapedStory.author
          });
          skippedCount++;
          continue;
        }
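
        // The check-duplicate endpoint is assumed (from how its response is
        // read below) to return JSON like:
        //   { "hasDuplicates": true, "count": 2 }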
        // Check for duplicates using query parameters
        try {
          const duplicateCheckUrl = `${serverSideApiBaseUrl}/stories/check-duplicate`;
          const params = new URLSearchParams({
            title: scrapedStory.title,
            authorName: scrapedStory.author
          });

          const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
            method: 'GET',
            headers: {
              'Authorization': authorization,
              'Content-Type': 'application/json',
            },
          });

          if (duplicateCheckResponse.ok) {
            const duplicateResult = await duplicateCheckResponse.json();
            if (duplicateResult.hasDuplicates) {
              results.push({
                url: trimmedUrl,
                status: 'skipped',
                reason: `Duplicate story found (${duplicateResult.count} existing)`,
                title: scrapedStory.title,
                author: scrapedStory.author
              });
              skippedCount++;
              continue;
            }
          }
        } catch (error) {
          console.warn('Duplicate check failed:', error);
          // Continue with the import if the duplicate check fails
        }

        // Create the story
        try {
          const storyData = {
            title: scrapedStory.title,
            summary: scrapedStory.summary || undefined,
            contentHtml: scrapedStory.content,
            sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
            authorName: scrapedStory.author,
            tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
          };

          const createUrl = `${serverSideApiBaseUrl}/stories`;
          const createResponse = await fetch(createUrl, {
            method: 'POST',
            headers: {
              'Authorization': authorization,
              'Content-Type': 'application/json',
            },
            body: JSON.stringify(storyData),
          });

          if (!createResponse.ok) {
            // Fall back to a generic message if the error body is not JSON
            const errorData = await createResponse.json().catch(() => ({}));
            throw new Error(errorData.message || 'Failed to create story');
          }

          const createdStory = await createResponse.json();

          results.push({
            url: trimmedUrl,
            status: 'imported',
            title: scrapedStory.title,
            author: scrapedStory.author,
            storyId: createdStory.id
          });
          importedCount++;

          console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);

        } catch (error) {
          console.error(`Failed to create story for ${trimmedUrl}:`, error);

          let errorMessage = 'Failed to create story';
          if (error instanceof Error) {
            errorMessage = error.message;
          }

          results.push({
            url: trimmedUrl,
            status: 'error',
            error: errorMessage,
            title: scrapedStory.title,
            author: scrapedStory.author
          });
          errorCount++;
        }

      } catch (error) {
        console.error(`Error processing URL ${url}:`, error);

        let errorMessage = 'Unknown error';
        if (error instanceof Error) {
          errorMessage = error.message;
        }

        results.push({
          url,
          status: 'error',
          error: errorMessage
        });
        errorCount++;
      }
    }

    const response: BulkImportResponse = {
      results,
      summary: {
        total: urls.length,
        imported: importedCount,
        skipped: skippedCount,
        errors: errorCount
      }
    };

    console.log(`Bulk import completed:`, response.summary);
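    // e.g. "Bulk import completed: { total: 10, imported: 7, skipped: 2, errors: 1 }"
    // (counts illustrative; the object logged is the summary built above)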

    // Trigger a Typesense reindex if any stories were imported
    if (importedCount > 0) {
      try {
        console.log('Triggering Typesense reindex after bulk import...');
        const reindexUrl = `${serverSideApiBaseUrl}/stories/reindex-typesense`;
        const reindexResponse = await fetch(reindexUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });

        if (reindexResponse.ok) {
          const reindexResult = await reindexResponse.json();
          console.log('Typesense reindex completed:', reindexResult);
        } else {
          console.warn('Typesense reindex failed:', reindexResponse.status);
        }
      } catch (error) {
        console.warn('Failed to trigger Typesense reindex:', error);
        // Don't fail the whole request if the reindex fails
      }
    }

    return NextResponse.json(response);

  } catch (error) {
    console.error('Bulk import error:', error);

    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed: ${error.message}` },
        { status: 500 }
      );
    }

    return NextResponse.json(
      { error: 'Bulk import failed due to an unknown error' },
      { status: 500 }
    );
  }
}
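
A minimal client-side sketch of calling this route (the path follows the file location app/scrape/bulk/route.ts; the helper name and bearer-token handling are illustrative assumptions, not part of this commit):

// Hypothetical helper; the route only checks that an Authorization
// header is present, a bearer token is assumed here for illustration.
async function bulkImport(urls: string[], token: string) {
  const res = await fetch('/scrape/bulk', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${token}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({ urls }),
  });
  if (!res.ok) {
    throw new Error(`Bulk import failed with status ${res.status}`);
  }
  // Resolves to the BulkImportResponse shape declared in the route
  return res.json();
}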