storycove/frontend/src/app/scrape/bulk/route.ts

import { NextRequest, NextResponse } from 'next/server';
import { sendProgressUpdate } from '../../../lib/progress';

// Configure route timeout for long-running scraping operations
export const maxDuration = 900; // 15 minutes (900 seconds)
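
/**
 * POST /scrape/bulk — starts a long-running bulk scrape in the background and
 * reports progress via sendProgressUpdate, keyed by the caller-supplied
 * sessionId. Two modes: import each URL as its own story, or combine every
 * URL's content into a single story payload.
 */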

interface BulkImportRequest {
  urls: string[];
  combineIntoOne?: boolean;
  sessionId?: string; // For progress tracking
}

interface ImportResult {
  url: string;
  status: 'imported' | 'skipped' | 'error';
  reason?: string;
  title?: string;
  author?: string;
  error?: string;
  storyId?: string;
}

interface BulkImportResponse {
  results: ImportResult[];
  summary: {
    total: number;
    imported: number;
    skipped: number;
    errors: number;
  };
  combinedStory?: {
    title: string;
    author: string;
    content: string;
    summary?: string;
    sourceUrl: string;
    tags?: string[];
  };
}
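
// Note: BulkImportResponse documents the shape of the final 'completed'
// progress payload; it is not referenced directly elsewhere in this route.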

// Background processing for combined mode: scrapes every URL, merges the
// content into a single story, and delivers the result via the progress channel.
async function processCombinedMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any // StoryScraper instance; typed loosely because it is imported dynamically
) {
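  // Accumulators, filled per URL; the base* fields come from the first
  // successfully scraped URL.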
  const results: ImportResult[] = [];
  let importedCount = 0;
  let errorCount = 0;
  const combinedContent: string[] = [];
  let baseTitle = '';
  let baseAuthor = '';
  let baseSummary = '';
  let baseSourceUrl = '';
  const combinedTags = new Set<string>();
  let totalWordCount = 0;

  // Send initial progress update
  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to scrape ${urls.length} URLs for combining...`,
    totalWordCount: 0
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Scraping URL ${i + 1}/${urls.length} for combine: ${url}`);

    // Send progress update
    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Scraping URL ${i + 1} of ${urls.length}...`,
      url: url,
      totalWordCount
    });

    try {
      const trimmedUrl = url.trim();
      if (!trimmedUrl) {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Empty URL in combined mode'
        });
        errorCount++;
        continue;
      }

      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Content is required for combined mode
      if (!scrapedStory.content || scrapedStory.content.trim() === '') {
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: 'No content found - required for combined mode'
        });
        errorCount++;
        continue;
      }

      // Use the first URL for base metadata (missing fields fall back to defaults)
      if (i === 0) {
        baseTitle = scrapedStory.title || 'Combined Story';
        baseAuthor = scrapedStory.author || 'Unknown Author';
        baseSummary = scrapedStory.summary || '';
        baseSourceUrl = trimmedUrl;
      }

      // Add content with URL separator
      combinedContent.push(`<!-- Content from: ${trimmedUrl} -->`);
      if (scrapedStory.title && i > 0) {
        combinedContent.push(`<h2>${scrapedStory.title}</h2>`);
      }
      combinedContent.push(scrapedStory.content);
      combinedContent.push('<hr/>'); // Visual separator between parts

      // Calculate word count for this story
      const textContent = scrapedStory.content.replace(/<[^>]*>/g, ''); // Strip HTML
      const wordCount = textContent.split(/\s+/).filter((word: string) => word.length > 0).length;
      totalWordCount += wordCount;

      // Collect tags from all stories
      if (scrapedStory.tags) {
        scrapedStory.tags.forEach((tag: string) => combinedTags.add(tag));
      }

      results.push({
        url: trimmedUrl,
        status: 'imported',
        title: scrapedStory.title,
        author: scrapedStory.author
      });
      importedCount++;

      // Send progress update with word count
      await sendProgressUpdate(sessionId, {
        type: 'progress',
        current: i + 1,
        total: urls.length,
        message: `Scraped "${scrapedStory.title}" (${wordCount.toLocaleString()} words)`,
        url: trimmedUrl,
        title: scrapedStory.title,
        author: scrapedStory.author,
        wordCount: wordCount,
        totalWordCount: totalWordCount
      });
    } catch (error) {
      console.error(`Error processing URL ${url} in combined mode:`, error);
      results.push({
        url: url,
        status: 'error',
        error: error instanceof Error ? error.message : 'Unknown error'
      });
      errorCount++;
    }
  }

  // If we have any errors, fail the entire combined operation
  if (errorCount > 0) {
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: urls.length,
      total: urls.length,
      message: 'Combined mode failed: some URLs could not be processed',
      error: `${errorCount} URLs failed to process`
    });
    return;
  }

  // Check content size to prevent response size issues
  const combinedContentString = combinedContent.join('\n');
  const contentSizeInMB = new Blob([combinedContentString]).size / (1024 * 1024);
  console.log(`Combined content size: ${contentSizeInMB.toFixed(2)} MB`);
  console.log(`Combined content character length: ${combinedContentString.length}`);
  console.log(`Combined content parts count: ${combinedContent.length}`);
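
  // If the combined content exceeds 10 MB, the character length is scaled by
  // (10 / sizeInMB) to approximate a 10 MB cut. This assumes roughly one byte
  // per character, so multi-byte content may land slightly over or under, and
  // the cut can fall mid-tag since it ignores HTML structure.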
  // Handle content truncation if needed
  const finalContent = contentSizeInMB > 10
    ? combinedContentString.substring(0, Math.floor(combinedContentString.length * (10 / contentSizeInMB))) + '\n\n<!-- Content truncated due to size limit -->'
    : combinedContentString;
  let finalSummary = contentSizeInMB > 10 ? baseSummary + ' (Content truncated due to size limit)' : baseSummary;

  // Check if combined content has images and mark for processing
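  // Note: this regex only matches <img> tags whose src value is quoted;
  // unquoted src attributes would go undetected.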
  const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(finalContent);
  if (hasImages) {
    finalSummary += ' (Contains embedded images - will be processed after story creation)';
    console.log(`Combined story contains embedded images - will need processing after creation`);
  }

  // Return the combined story data via progress update
  const combinedStory = {
    title: baseTitle,
    author: baseAuthor,
    content: finalContent,
    summary: finalSummary,
    sourceUrl: baseSourceUrl,
    tags: Array.from(combinedTags),
    hasImages: hasImages
  };

  // Send completion notification for combine mode
  let completionMessage = `Combined scraping completed: ${totalWordCount.toLocaleString()} words from ${importedCount} stories`;
  if (hasImages) {
    completionMessage += ` (embedded images will be processed when story is created)`;
  }
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: completionMessage,
    totalWordCount: totalWordCount,
    combinedStory: combinedStory
  });
  console.log(`Combined scraping completed: ${importedCount} URLs combined into one story`);
}

// Background processing for individual mode: imports each URL as its own story.
async function processIndividualMode(
  urls: string[],
  sessionId: string,
  authorization: string,
  scraper: any // StoryScraper instance; typed loosely because it is imported dynamically
) {
  const results: ImportResult[] = [];
  let importedCount = 0;
  let skippedCount = 0;
  let errorCount = 0;

  await sendProgressUpdate(sessionId, {
    type: 'progress',
    current: 0,
    total: urls.length,
    message: `Starting to import ${urls.length} URLs individually...`
  });

  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    console.log(`Processing URL ${i + 1}/${urls.length}: ${url}`);

    await sendProgressUpdate(sessionId, {
      type: 'progress',
      current: i,
      total: urls.length,
      message: `Processing URL ${i + 1} of ${urls.length}...`,
      url: url
    });

    try {
      // Validate URL format
      if (!url || typeof url !== 'string' || url.trim() === '') {
        results.push({
          url: url || 'Empty URL',
          status: 'error',
          error: 'Invalid URL format'
        });
        errorCount++;
        continue;
      }
      const trimmedUrl = url.trim();

      // Scrape the story
      const scrapedStory = await scraper.scrapeStory(trimmedUrl);

      // Validate required fields
      if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
        const missingFields = [];
        if (!scrapedStory.title) missingFields.push('title');
        if (!scrapedStory.author) missingFields.push('author');
        if (!scrapedStory.content) missingFields.push('content');
        results.push({
          url: trimmedUrl,
          status: 'skipped',
          reason: `Missing required fields: ${missingFields.join(', ')}`,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        skippedCount++;
        continue;
      }

      // Check for duplicates using query parameters
      try {
        const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
        const params = new URLSearchParams({
          title: scrapedStory.title,
          authorName: scrapedStory.author
        });
        const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
          method: 'GET',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
        });
        if (duplicateCheckResponse.ok) {
          const duplicateResult = await duplicateCheckResponse.json();
          if (duplicateResult.hasDuplicates) {
            results.push({
              url: trimmedUrl,
              status: 'skipped',
              reason: `Duplicate story found (${duplicateResult.count} existing)`,
              title: scrapedStory.title,
              author: scrapedStory.author
            });
            skippedCount++;
            continue;
          }
        }
      } catch (error) {
        console.warn('Duplicate check failed:', error);
        // Continue with import if duplicate check fails
      }

      // Create the story
      try {
        const storyData = {
          title: scrapedStory.title,
          summary: scrapedStory.summary || undefined,
          contentHtml: scrapedStory.content,
          sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
          authorName: scrapedStory.author,
          tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
        };
        const createUrl = `http://backend:8080/api/stories`;
        const createResponse = await fetch(createUrl, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify(storyData),
        });
        if (!createResponse.ok) {
          const errorData = await createResponse.json();
          throw new Error(errorData.message || 'Failed to create story');
        }
        const createdStory = await createResponse.json();

        // Process embedded images if content contains images
        let imageProcessingWarnings: string[] = [];
        const hasImages = /<img[^>]+src=['"][^'"]*['"][^>]*>/i.test(scrapedStory.content);
        if (hasImages) {
          try {
            console.log(`Processing embedded images for story: ${createdStory.id}`);
            const imageProcessUrl = `http://backend:8080/api/stories/${createdStory.id}/process-content-images`;
            const imageProcessResponse = await fetch(imageProcessUrl, {
              method: 'POST',
              headers: {
                'Authorization': authorization,
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({ htmlContent: scrapedStory.content }),
            });
            if (imageProcessResponse.ok) {
              const imageResult = await imageProcessResponse.json();
              if (imageResult.hasWarnings && imageResult.warnings) {
                imageProcessingWarnings = imageResult.warnings;
                console.log(`Image processing completed with warnings for story ${createdStory.id}:`, imageResult.warnings);
              } else {
                console.log(`Image processing completed successfully for story ${createdStory.id}. Downloaded ${imageResult.downloadedImages?.length || 0} images.`);
              }
              // Update story content with processed images
              if (imageResult.processedContent && imageResult.processedContent !== scrapedStory.content) {
                const updateUrl = `http://backend:8080/api/stories/${createdStory.id}`;
                const updateResponse = await fetch(updateUrl, {
                  method: 'PUT',
                  headers: {
                    'Authorization': authorization,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify({
                    contentHtml: imageResult.processedContent
                  }),
                });
                if (!updateResponse.ok) {
                  console.warn(`Failed to update story content after image processing for ${createdStory.id}`);
                  imageProcessingWarnings.push('Failed to update story content with processed images');
                }
              }
            } else {
              console.warn(`Image processing failed for story ${createdStory.id}:`, imageProcessResponse.status);
              imageProcessingWarnings.push('Image processing failed');
            }
          } catch (error) {
            console.error(`Error processing images for story ${createdStory.id}:`, error);
            imageProcessingWarnings.push(`Image processing error: ${error instanceof Error ? error.message : 'Unknown error'}`);
          }
        }

        results.push({
          url: trimmedUrl,
          status: 'imported',
          title: scrapedStory.title,
          author: scrapedStory.author,
          storyId: createdStory.id
        });
        importedCount++;
        console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})${hasImages ? ` with ${imageProcessingWarnings.length > 0 ? 'warnings' : 'successful image processing'}` : ''}`);

        // Send progress update for successful import
        let progressMessage = `Imported "${scrapedStory.title}" by ${scrapedStory.author}`;
        if (hasImages) {
          progressMessage += imageProcessingWarnings.length > 0 ? ' (with image warnings)' : ' (with images)';
        }
        await sendProgressUpdate(sessionId, {
          type: 'progress',
          current: i + 1,
          total: urls.length,
          message: progressMessage,
          url: trimmedUrl,
          title: scrapedStory.title,
          author: scrapedStory.author,
          hasImages: hasImages,
          imageWarnings: imageProcessingWarnings
        });
      } catch (error) {
        console.error(`Failed to create story for ${trimmedUrl}:`, error);
        let errorMessage = 'Failed to create story';
        if (error instanceof Error) {
          errorMessage = error.message;
        }
        results.push({
          url: trimmedUrl,
          status: 'error',
          error: errorMessage,
          title: scrapedStory.title,
          author: scrapedStory.author
        });
        errorCount++;
      }
    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);
      let errorMessage = 'Unknown error';
      if (error instanceof Error) {
        errorMessage = error.message;
      }
      results.push({
        url: url,
        status: 'error',
        error: errorMessage
      });
      errorCount++;
    }
  }

  // Send completion notification
  await sendProgressUpdate(sessionId, {
    type: 'completed',
    current: urls.length,
    total: urls.length,
    message: `Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`,
    results: results,
    summary: {
      total: urls.length,
      imported: importedCount,
      skipped: skippedCount,
      errors: errorCount
    }
  });
  console.log(`Bulk import completed: ${importedCount} imported, ${skippedCount} skipped, ${errorCount} errors`);

  // Trigger Solr reindex if any stories were imported
  if (importedCount > 0) {
    try {
      console.log('Triggering Solr reindex after bulk import...');
      const reindexUrl = `http://backend:8080/api/admin/search/solr/reindex`;
      const reindexResponse = await fetch(reindexUrl, {
        method: 'POST',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      if (reindexResponse.ok) {
        const reindexResult = await reindexResponse.json();
        console.log('Solr reindex completed:', reindexResult);
      } else {
        console.warn('Solr reindex failed:', reindexResponse.status);
      }
    } catch (error) {
      console.warn('Failed to trigger Solr reindex:', error);
      // Don't fail the whole operation if reindex fails
    }
  }
}

// Background entry point: loads the scraper and dispatches to combined or
// individual processing. Runs after the POST handler has already responded.
async function processBulkImport(
  urls: string[],
  combineIntoOne: boolean,
  sessionId: string,
  authorization: string
) {
  try {
    // Dynamic imports to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');
    const scraper = new StoryScraper();
    console.log(`Starting bulk scraping for ${urls.length} URLs${combineIntoOne ? ' (combine mode)' : ''}`);
    console.log(`Session ID: ${sessionId}`);

    // Quick test to verify backend connectivity
    try {
      console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
      const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
        method: 'GET',
        headers: {
          'Authorization': authorization,
          'Content-Type': 'application/json',
        },
      });
      console.log(`Backend test response status: ${testResponse.status}`);
    } catch (error) {
      console.error(`Backend connectivity test failed:`, error);
    }

    // Handle combined mode
    if (combineIntoOne) {
      await processCombinedMode(urls, sessionId, authorization, scraper);
    } else {
      // Normal individual processing mode
      await processIndividualMode(urls, sessionId, authorization, scraper);
    }
  } catch (error) {
    console.error('Background bulk import error:', error);
    await sendProgressUpdate(sessionId, {
      type: 'error',
      current: 0,
      total: urls.length,
      message: 'Bulk import failed due to an error',
      error: error instanceof Error ? error.message : 'Unknown error'
    });
  }
}
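
/**
 * POST handler: validates the request (auth header, a non-empty URL list of
 * at most 200 entries, a sessionId), fires off processBulkImport without
 * awaiting it, and returns immediately so the client can follow progress
 * under the given sessionId.
 */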
export async function POST(request: NextRequest) {
  try {
    // Check for authentication
    const authorization = request.headers.get('authorization');
    if (!authorization) {
      return NextResponse.json(
        { error: 'Authentication required for bulk import' },
        { status: 401 }
      );
    }

    const body = await request.json();
    const { urls, combineIntoOne = false, sessionId } = body as BulkImportRequest;
    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return NextResponse.json(
        { error: 'URLs array is required and must not be empty' },
        { status: 400 }
      );
    }
    if (urls.length > 200) {
      return NextResponse.json(
        { error: 'Maximum 200 URLs allowed per bulk import' },
        { status: 400 }
      );
    }
    if (!sessionId) {
      return NextResponse.json(
        { error: 'Session ID is required for progress tracking' },
        { status: 400 }
      );
    }

    // Start the background processing (intentionally not awaited)
    processBulkImport(urls, combineIntoOne, sessionId, authorization).catch(error => {
      console.error('Failed to start background processing:', error);
    });

    // Return immediately with session info
    return NextResponse.json({
      message: 'Bulk import started',
      sessionId: sessionId,
      totalUrls: urls.length,
      combineMode: combineIntoOne
    });
  } catch (error) {
    console.error('Bulk import initialization error:', error);
    if (error instanceof Error) {
      return NextResponse.json(
        { error: `Bulk import failed to start: ${error.message}` },
        { status: 500 }
      );
    }
    return NextResponse.json(
      { error: 'Bulk import failed to start due to an unknown error' },
      { status: 500 }
    );
  }
}
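
/*
 * Example client call (a sketch; the endpoint path follows this file's
 * location, and the progress channel behind sendProgressUpdate is assumed to
 * be readable by the same sessionId the client generates):
 *
 *   const sessionId = crypto.randomUUID();
 *   const res = await fetch('/scrape/bulk', {
 *     method: 'POST',
 *     headers: {
 *       'Authorization': `Bearer ${token}`, // token: hypothetical auth credential
 *       'Content-Type': 'application/json',
 *     },
 *     body: JSON.stringify({ urls, combineIntoOne: false, sessionId }),
 *   });
 *   // res resolves as soon as the import is queued; watch progress via sessionId.
 */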