scraping and improvements

2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions
--- a/frontend/src/app/scrape/bulk/route.ts
+++ b/frontend/src/app/scrape/bulk/route.ts
@@ -0,0 +1,292 @@
+import { NextRequest, NextResponse } from 'next/server';
+
+interface BulkImportRequest { // JSON body expected by the POST handler
+  urls: string[]; // story page URLs to scrape; the handler accepts 1 to 50 entries
+}
+
+interface ImportResult { // outcome of processing one submitted URL
+  url: string; // the submitted URL, or the placeholder 'Empty URL' when the entry was falsy
+  status: 'imported' | 'skipped' | 'error'; // created / deliberately not created / failed
+  reason?: string; // set on 'skipped': missing required fields or a duplicate story
+  title?: string; // scraped title, when scraping produced one
+  author?: string; // scraped author, when scraping produced one
+  error?: string; // set on 'error': message describing the failure
+  storyId?: string; // set on 'imported': `id` of the story returned by the backend
+}
+
+interface BulkImportResponse { // JSON body returned once every URL has been processed
+  results: ImportResult[]; // one entry per submitted URL, in the order they were processed
+  summary: {
+    total: number; // number of URLs submitted
+    imported: number; // count of results with status 'imported'
+    skipped: number; // count of results with status 'skipped'
+    errors: number; // count of results with status 'error'
+  };
+}
+
+// Base URL for server-side calls to the backend API. Client-side calls go
+// through nginx via NEXT_PUBLIC_API_URL, but this handler runs inside Docker
+// and needs direct container-to-container access.
+const BACKEND_API_BASE_URL = 'http://backend:8080/api';
+
+// Largest number of URLs accepted in one bulk import request.
+const MAX_BULK_IMPORT_URLS = 50;
+
+/** Returns the message of a thrown `Error`, or `fallback` for any other thrown value. */
+function describeError(error: unknown, fallback: string): string {
+  return error instanceof Error ? error.message : fallback;
+}
+
+/** Builds the headers for a backend call, forwarding the caller's Authorization header unchanged. */
+function backendHeaders(authorization: string): Record<string, string> {
+  return {
+    'Authorization': authorization,
+    'Content-Type': 'application/json',
+  };
+}
+
+/**
+ * Reads the `message` field of a failed backend response.
+ *
+ * @param response - Backend response whose status was not OK.
+ * @param fallback - Text to use when the body carries no usable message.
+ * @returns The backend's message; `fallback` when the JSON body has none; or
+ *   `fallback` plus the HTTP status when the body is not JSON at all.
+ */
+async function readBackendErrorMessage(response: Response, fallback: string): Promise<string> {
+  let payload: unknown;
+  try {
+    payload = await response.json();
+  } catch {
+    // Empty or non-JSON body, e.g. a proxy error page. Report the status
+    // rather than a JSON parse error.
+    return `${fallback} (HTTP ${response.status})`;
+  }
+
+  if (typeof payload === 'object' && payload !== null) {
+    const message = (payload as { message?: unknown }).message;
+    if (typeof message === 'string' && message !== '') {
+      return message;
+    }
+  }
+  return fallback;
+}
+
+/**
+ * Bulk story import endpoint.
+ *
+ * Expects a JSON body `{ urls: string[] }` with 1 to 50 entries and an
+ * `Authorization` header, which is forwarded to every backend call. URLs are
+ * processed one at a time: each is scraped, skipped when title, author or
+ * content is missing or when the backend reports a duplicate, and otherwise
+ * created through the backend. If at least one story was created, a Typesense
+ * reindex is requested before responding.
+ *
+ * @param request - Incoming Next.js request.
+ * @returns 401 without an Authorization header; 400 for a malformed body or an
+ *   invalid `urls` array; 200 with a `BulkImportResponse` once every URL has
+ *   been processed (per-URL failures are reported in `results`, not as an HTTP
+ *   error); 500 for any other unexpected failure.
+ */
+export async function POST(request: NextRequest) {
+  try {
+    // NOTE(review): only the presence of the header is checked here. The
+    // backend sees it on the calls below, but scraping runs before the first
+    // of those calls - confirm that is acceptable for untrusted callers.
+    const authorization = request.headers.get('authorization');
+    if (!authorization) {
+      return NextResponse.json(
+        { error: 'Authentication required for bulk import' },
+        { status: 401 }
+      );
+    }
+
+    // A body that is not JSON is a client error, not a server failure.
+    let body: unknown;
+    try {
+      body = await request.json();
+    } catch {
+      return NextResponse.json(
+        { error: 'Request body must be valid JSON' },
+        { status: 400 }
+      );
+    }
+
+    // `body` may be null or a primitive, so read `urls` without destructuring.
+    const urls = (body as Partial<BulkImportRequest> | null)?.urls;
+
+    if (!urls || !Array.isArray(urls) || urls.length === 0) {
+      return NextResponse.json(
+        { error: 'URLs array is required and must not be empty' },
+        { status: 400 }
+      );
+    }
+
+    if (urls.length > MAX_BULK_IMPORT_URLS) {
+      return NextResponse.json(
+        { error: `Maximum ${MAX_BULK_IMPORT_URLS} URLs allowed per bulk import` },
+        { status: 400 }
+      );
+    }
+
+    // Dynamic imports to prevent client-side bundling
+    const { StoryScraper } = await import('@/lib/scraper/scraper');
+    const scraper = new StoryScraper();
+
+    const headers = backendHeaders(authorization);
+    const results: ImportResult[] = [];
+
+    console.log(`Starting bulk scraping for ${urls.length} URLs`);
+
+    // Elements come from untrusted JSON, so the declared `string` element type
+    // is not guaranteed at runtime.
+    const entries: readonly unknown[] = urls;
+
+    // Sequential on purpose: one scrape and one backend write at a time.
+    for (const entry of entries) {
+      console.log(`Processing URL: ${entry}`);
+
+      if (typeof entry !== 'string' || entry.trim() === '') {
+        results.push({
+          url: entry ? String(entry) : 'Empty URL',
+          status: 'error',
+          error: 'Invalid URL format'
+        });
+        continue;
+      }
+
+      const trimmedUrl = entry.trim();
+
+      try {
+        const scrapedStory = await scraper.scrapeStory(trimmedUrl);
+
+        // A story without title, author or content cannot be created.
+        if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
+          const missingFields: string[] = [];
+          if (!scrapedStory.title) missingFields.push('title');
+          if (!scrapedStory.author) missingFields.push('author');
+          if (!scrapedStory.content) missingFields.push('content');
+
+          results.push({
+            url: trimmedUrl,
+            status: 'skipped',
+            reason: `Missing required fields: ${missingFields.join(', ')}`,
+            title: scrapedStory.title,
+            author: scrapedStory.author
+          });
+          continue;
+        }
+
+        // Best-effort duplicate check: any failure here falls through to the
+        // import instead of failing the URL.
+        try {
+          const params = new URLSearchParams({
+            title: scrapedStory.title,
+            authorName: scrapedStory.author
+          });
+
+          const duplicateCheckResponse = await fetch(
+            `${BACKEND_API_BASE_URL}/stories/check-duplicate?${params.toString()}`,
+            { method: 'GET', headers }
+          );
+
+          if (duplicateCheckResponse.ok) {
+            const duplicateResult = await duplicateCheckResponse.json();
+            if (duplicateResult.hasDuplicates) {
+              results.push({
+                url: trimmedUrl,
+                status: 'skipped',
+                reason: `Duplicate story found (${duplicateResult.count} existing)`,
+                title: scrapedStory.title,
+                author: scrapedStory.author
+              });
+              continue;
+            }
+          } else {
+            console.warn(
+              `Duplicate check returned HTTP ${duplicateCheckResponse.status}; importing without it`
+            );
+          }
+        } catch (error) {
+          console.warn('Duplicate check failed:', error);
+        }
+
+        // Create the story
+        try {
+          const storyData = {
+            title: scrapedStory.title,
+            summary: scrapedStory.summary || undefined,
+            contentHtml: scrapedStory.content,
+            sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
+            authorName: scrapedStory.author,
+            tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
+          };
+
+          const createResponse = await fetch(`${BACKEND_API_BASE_URL}/stories`, {
+            method: 'POST',
+            headers,
+            body: JSON.stringify(storyData),
+          });
+
+          if (!createResponse.ok) {
+            throw new Error(
+              await readBackendErrorMessage(createResponse, 'Failed to create story')
+            );
+          }
+
+          const createdStory = await createResponse.json();
+
+          results.push({
+            url: trimmedUrl,
+            status: 'imported',
+            title: scrapedStory.title,
+            author: scrapedStory.author,
+            storyId: createdStory.id
+          });
+
+          console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);
+        } catch (error) {
+          console.error(`Failed to create story for ${trimmedUrl}:`, error);
+
+          results.push({
+            url: trimmedUrl,
+            status: 'error',
+            error: describeError(error, 'Failed to create story'),
+            title: scrapedStory.title,
+            author: scrapedStory.author
+          });
+        }
+      } catch (error) {
+        // Reached when scraping itself throws.
+        console.error(`Error processing URL ${trimmedUrl}:`, error);
+
+        results.push({
+          url: trimmedUrl,
+          status: 'error',
+          error: describeError(error, 'Unknown error')
+        });
+      }
+    }
+
+    // Every iteration pushes exactly one result, so the summary is derived
+    // from `results` instead of being tracked in separate counters.
+    const countByStatus = (status: ImportResult['status']): number =>
+      results.filter((result) => result.status === status).length;
+
+    const importedCount = countByStatus('imported');
+
+    const response: BulkImportResponse = {
+      results,
+      summary: {
+        total: urls.length,
+        imported: importedCount,
+        skipped: countByStatus('skipped'),
+        errors: countByStatus('error')
+      }
+    };
+
+    console.log(`Bulk import completed:`, response.summary);
+
+    // Trigger Typesense reindex if any stories were imported
+    if (importedCount > 0) {
+      try {
+        console.log('Triggering Typesense reindex after bulk import...');
+        const reindexResponse = await fetch(`${BACKEND_API_BASE_URL}/stories/reindex-typesense`, {
+          method: 'POST',
+          headers,
+        });
+
+        if (reindexResponse.ok) {
+          const reindexResult = await reindexResponse.json();
+          console.log('Typesense reindex completed:', reindexResult);
+        } else {
+          console.warn('Typesense reindex failed:', reindexResponse.status);
+        }
+      } catch (error) {
+        console.warn('Failed to trigger Typesense reindex:', error);
+        // Don't fail the whole request if reindex fails
+      }
+    }
+
+    return NextResponse.json(response);
+  } catch (error) {
+    console.error('Bulk import error:', error);
+
+    const message = error instanceof Error
+      ? `Bulk import failed: ${error.message}`
+      : 'Bulk import failed due to an unknown error';
+
+    return NextResponse.json({ error: message }, { status: 500 });
+  }
+}