scraping and improvements

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

View File

@@ -0,0 +1,292 @@
import { NextRequest, NextResponse } from 'next/server';
interface BulkImportRequest {
urls: string[];
}
interface ImportResult {
url: string;
status: 'imported' | 'skipped' | 'error';
reason?: string;
title?: string;
author?: string;
error?: string;
storyId?: string;
}
interface BulkImportResponse {
results: ImportResult[];
summary: {
total: number;
imported: number;
skipped: number;
errors: number;
};
}
export async function POST(request: NextRequest) {
try {
// Check for authentication
const authorization = request.headers.get('authorization');
if (!authorization) {
return NextResponse.json(
{ error: 'Authentication required for bulk import' },
{ status: 401 }
);
}
const body = await request.json();
const { urls } = body as BulkImportRequest;
if (!urls || !Array.isArray(urls) || urls.length === 0) {
return NextResponse.json(
{ error: 'URLs array is required and must not be empty' },
{ status: 400 }
);
}
if (urls.length > 50) {
return NextResponse.json(
{ error: 'Maximum 50 URLs allowed per bulk import' },
{ status: 400 }
);
}
// Dynamic imports to prevent client-side bundling
const { StoryScraper } = await import('@/lib/scraper/scraper');
const scraper = new StoryScraper();
const results: ImportResult[] = [];
let importedCount = 0;
let skippedCount = 0;
let errorCount = 0;
console.log(`Starting bulk scraping for ${urls.length} URLs`);
console.log(`Environment NEXT_PUBLIC_API_URL: ${process.env.NEXT_PUBLIC_API_URL}`);
// For server-side API calls in Docker, use direct backend container URL
// Client-side calls use NEXT_PUBLIC_API_URL through nginx, but server-side needs direct container access
const serverSideApiBaseUrl = 'http://backend:8080/api';
console.log(`DEBUG: serverSideApiBaseUrl variable is: ${serverSideApiBaseUrl}`);
// Quick test to verify backend connectivity
try {
console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
console.log(`Backend test response status: ${testResponse.status}`);
} catch (error) {
console.error(`Backend connectivity test failed:`, error);
}
for (const url of urls) {
console.log(`Processing URL: ${url}`);
try {
// Validate URL format
if (!url || typeof url !== 'string' || url.trim() === '') {
results.push({
url: url || 'Empty URL',
status: 'error',
error: 'Invalid URL format'
});
errorCount++;
continue;
}
const trimmedUrl = url.trim();
// Scrape the story
const scrapedStory = await scraper.scrapeStory(trimmedUrl);
// Validate required fields
if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
const missingFields = [];
if (!scrapedStory.title) missingFields.push('title');
if (!scrapedStory.author) missingFields.push('author');
if (!scrapedStory.content) missingFields.push('content');
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Missing required fields: ${missingFields.join(', ')}`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
// Check for duplicates using query parameters
try {
// Use hardcoded backend URL for container-to-container communication
const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
console.log(`Duplicate check URL: ${duplicateCheckUrl}`);
const params = new URLSearchParams({
title: scrapedStory.title,
authorName: scrapedStory.author
});
const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
method: 'GET',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (duplicateCheckResponse.ok) {
const duplicateResult = await duplicateCheckResponse.json();
if (duplicateResult.hasDuplicates) {
results.push({
url: trimmedUrl,
status: 'skipped',
reason: `Duplicate story found (${duplicateResult.count} existing)`,
title: scrapedStory.title,
author: scrapedStory.author
});
skippedCount++;
continue;
}
}
} catch (error) {
console.warn('Duplicate check failed:', error);
// Continue with import if duplicate check fails
}
// Create the story
try {
const storyData = {
title: scrapedStory.title,
summary: scrapedStory.summary || undefined,
contentHtml: scrapedStory.content,
sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
authorName: scrapedStory.author,
tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
};
// Use hardcoded backend URL for container-to-container communication
const createUrl = `http://backend:8080/api/stories`;
console.log(`Create story URL: ${createUrl}`);
const createResponse = await fetch(createUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
body: JSON.stringify(storyData),
});
if (!createResponse.ok) {
const errorData = await createResponse.json();
throw new Error(errorData.message || 'Failed to create story');
}
const createdStory = await createResponse.json();
results.push({
url: trimmedUrl,
status: 'imported',
title: scrapedStory.title,
author: scrapedStory.author,
storyId: createdStory.id
});
importedCount++;
console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);
} catch (error) {
console.error(`Failed to create story for ${trimmedUrl}:`, error);
let errorMessage = 'Failed to create story';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: trimmedUrl,
status: 'error',
error: errorMessage,
title: scrapedStory.title,
author: scrapedStory.author
});
errorCount++;
}
} catch (error) {
console.error(`Error processing URL ${url}:`, error);
let errorMessage = 'Unknown error';
if (error instanceof Error) {
errorMessage = error.message;
}
results.push({
url: url,
status: 'error',
error: errorMessage
});
errorCount++;
}
}
const response: BulkImportResponse = {
results,
summary: {
total: urls.length,
imported: importedCount,
skipped: skippedCount,
errors: errorCount
}
};
console.log(`Bulk import completed:`, response.summary);
// Trigger Typesense reindex if any stories were imported
if (importedCount > 0) {
try {
console.log('Triggering Typesense reindex after bulk import...');
const reindexUrl = `http://backend:8080/api/stories/reindex-typesense`;
const reindexResponse = await fetch(reindexUrl, {
method: 'POST',
headers: {
'Authorization': authorization,
'Content-Type': 'application/json',
},
});
if (reindexResponse.ok) {
const reindexResult = await reindexResponse.json();
console.log('Typesense reindex completed:', reindexResult);
} else {
console.warn('Typesense reindex failed:', reindexResponse.status);
}
} catch (error) {
console.warn('Failed to trigger Typesense reindex:', error);
// Don't fail the whole request if reindex fails
}
}
return NextResponse.json(response);
} catch (error) {
console.error('Bulk import error:', error);
if (error instanceof Error) {
return NextResponse.json(
{ error: `Bulk import failed: ${error.message}` },
{ status: 500 }
);
}
return NextResponse.json(
{ error: 'Bulk import failed due to an unknown error' },
{ status: 500 }
);
}
}