Fix epub import

This commit is contained in:
Stefan Hardegger
2025-11-22 14:29:15 +01:00
parent 75768855e2
commit b1b5bbbccd
3 changed files with 121 additions and 63 deletions

View File

@@ -1,11 +1,11 @@
# Base image: Eclipse Temurin JDK 17 (tag indicates the Ubuntu "jammy" variant).
FROM eclipse-temurin:17-jdk-jammy
WORKDIR /app
# Install Maven and PostgreSQL 15 client tools.
# The PGDG signing key is dearmored into /etc/apt/trusted.gpg.d instead of being
# piped to the deprecated `apt-key add`, and the repository suite (jammy-pgdg)
# is chosen to match the "jammy" base image tag above.
RUN apt-get update && apt-get install -y wget ca-certificates gnupg maven && \
    wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg && \
    echo "deb http://apt.postgresql.org/pub/repos/apt/ jammy-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
    apt-get update && \
    apt-get install -y postgresql-client-15 && \
    rm -rf /var/lib/apt/lists/*

View File

@@ -71,45 +71,53 @@ public class EPUBImportService {
return EPUBImportResponse.error("Invalid EPUB file format");
}
log.info("Parsing EPUB file: {}", epubFile.getOriginalFilename());
Book book = parseEPUBFile(epubFile);
log.info("Creating story entity from EPUB metadata");
Story story = createStoryFromEPUB(book, request);
log.info("Saving story to database: {}", story.getTitle());
Story savedStory = storyService.create(story);
log.info("Story saved successfully with ID: {}", savedStory.getId());
// Process embedded images if content contains any
String originalContent = story.getContentHtml();
if (originalContent != null && originalContent.contains("<img")) {
try {
log.info("Processing embedded images for story: {}", savedStory.getId());
ImageService.ContentImageProcessingResult imageResult =
imageService.processContentImages(originalContent, savedStory.getId());
// Update story content with processed images if changed
if (!imageResult.getProcessedContent().equals(originalContent)) {
log.info("Updating story content with processed images");
savedStory.setContentHtml(imageResult.getProcessedContent());
savedStory = storyService.update(savedStory.getId(), savedStory);
// Log the image processing results
log.debug("EPUB Import - Image processing completed for story {}. Downloaded {} images.",
log.info("EPUB Import - Image processing completed for story {}. Downloaded {} images.",
savedStory.getId(), imageResult.getDownloadedImages().size());
if (imageResult.hasWarnings()) {
log.debug("EPUB Import - Image processing warnings: {}",
log.warn("EPUB Import - Image processing warnings: {}",
String.join(", ", imageResult.getWarnings()));
}
}
} catch (Exception e) {
// Log error but don't fail the import
System.err.println("EPUB Import - Failed to process embedded images for story " +
savedStory.getId() + ": " + e.getMessage());
log.error("EPUB Import - Failed to process embedded images for story {}: {}",
savedStory.getId(), e.getMessage(), e);
}
}
log.info("Building import response for story: {}", savedStory.getId());
EPUBImportResponse response = EPUBImportResponse.success(savedStory.getId(), savedStory.getTitle());
response.setWordCount(savedStory.getWordCount());
response.setTotalChapters(book.getSpine().size());
if (request.getPreserveReadingPosition() != null && request.getPreserveReadingPosition()) {
log.info("Extracting and saving reading position");
ReadingPosition readingPosition = extractReadingPosition(book, savedStory);
if (readingPosition != null) {
ReadingPosition savedPosition = readingPositionRepository.save(readingPosition);
@@ -117,9 +125,11 @@ public class EPUBImportService {
}
}
log.info("EPUB import completed successfully for: {}", savedStory.getTitle());
return response;
} catch (Exception e) {
log.error("EPUB import failed with exception: {}", e.getMessage(), e);
return EPUBImportResponse.error("Failed to import EPUB: " + e.getMessage());
}
}
@@ -148,9 +158,12 @@ public class EPUBImportService {
private Story createStoryFromEPUB(Book book, EPUBImportRequest request) {
Metadata metadata = book.getMetadata();
log.info("Extracting EPUB metadata");
String title = extractTitle(metadata);
String authorName = extractAuthorName(metadata, request);
String description = extractDescription(metadata);
log.info("Extracting and sanitizing content from {} chapters", book.getSpine().size());
String content = extractContent(book);
Story story = new Story();
@@ -160,64 +173,103 @@ public class EPUBImportService {
// Extract and process cover image
if (request.getExtractCover() == null || request.getExtractCover()) {
log.info("Extracting cover image");
String coverPath = extractAndSaveCoverImage(book);
if (coverPath != null) {
log.info("Cover image saved at: {}", coverPath);
story.setCoverPath(coverPath);
}
}
if (request.getAuthorId() != null) {
try {
Author author = authorService.findById(request.getAuthorId());
story.setAuthor(author);
} catch (ResourceNotFoundException e) {
if (request.getCreateMissingAuthor()) {
Author newAuthor = createAuthor(authorName);
story.setAuthor(newAuthor);
// Handle author assignment
try {
if (request.getAuthorId() != null) {
log.info("Looking up author by ID: {}", request.getAuthorId());
try {
Author author = authorService.findById(request.getAuthorId());
story.setAuthor(author);
log.info("Author found and assigned: {}", author.getName());
} catch (ResourceNotFoundException e) {
log.warn("Author ID {} not found", request.getAuthorId());
if (request.getCreateMissingAuthor()) {
log.info("Creating new author: {}", authorName);
Author newAuthor = createAuthor(authorName);
story.setAuthor(newAuthor);
log.info("New author created with ID: {}", newAuthor.getId());
}
}
} else if (authorName != null && request.getCreateMissingAuthor()) {
log.info("Finding or creating author: {}", authorName);
Author author = findOrCreateAuthor(authorName);
story.setAuthor(author);
log.info("Author assigned: {} (ID: {})", author.getName(), author.getId());
}
} else if (authorName != null && request.getCreateMissingAuthor()) {
Author author = findOrCreateAuthor(authorName);
story.setAuthor(author);
} catch (Exception e) {
log.error("Error handling author assignment: {}", e.getMessage(), e);
throw e;
}
if (request.getSeriesId() != null && request.getSeriesVolume() != null) {
try {
Series series = seriesService.findById(request.getSeriesId());
story.setSeries(series);
story.setVolume(request.getSeriesVolume());
} catch (ResourceNotFoundException e) {
if (request.getCreateMissingSeries() && request.getSeriesName() != null) {
Series newSeries = createSeries(request.getSeriesName());
story.setSeries(newSeries);
// Handle series assignment
try {
if (request.getSeriesId() != null && request.getSeriesVolume() != null) {
log.info("Looking up series by ID: {}", request.getSeriesId());
try {
Series series = seriesService.findById(request.getSeriesId());
story.setSeries(series);
story.setVolume(request.getSeriesVolume());
log.info("Series found and assigned: {} (volume {})", series.getName(), request.getSeriesVolume());
} catch (ResourceNotFoundException e) {
log.warn("Series ID {} not found", request.getSeriesId());
if (request.getCreateMissingSeries() && request.getSeriesName() != null) {
log.info("Creating new series: {}", request.getSeriesName());
Series newSeries = createSeries(request.getSeriesName());
story.setSeries(newSeries);
story.setVolume(request.getSeriesVolume());
log.info("New series created with ID: {}", newSeries.getId());
}
}
}
} catch (Exception e) {
log.error("Error handling series assignment: {}", e.getMessage(), e);
throw e;
}
// Handle tags from request or extract from EPUB metadata
List<String> allTags = new ArrayList<>();
if (request.getTags() != null && !request.getTags().isEmpty()) {
allTags.addAll(request.getTags());
}
try {
List<String> allTags = new ArrayList<>();
if (request.getTags() != null && !request.getTags().isEmpty()) {
allTags.addAll(request.getTags());
}
// Extract subjects/keywords from EPUB metadata
List<String> epubTags = extractTags(metadata);
if (epubTags != null && !epubTags.isEmpty()) {
allTags.addAll(epubTags);
}
// Extract subjects/keywords from EPUB metadata
List<String> epubTags = extractTags(metadata);
if (epubTags != null && !epubTags.isEmpty()) {
allTags.addAll(epubTags);
}
// Remove duplicates and create tags
allTags.stream()
.distinct()
.forEach(tagName -> {
Tag tag = tagService.findOrCreate(tagName.trim());
story.addTag(tag);
});
log.info("Processing {} tags for story", allTags.size());
// Remove duplicates and create tags
allTags.stream()
.distinct()
.forEach(tagName -> {
try {
log.debug("Finding or creating tag: {}", tagName);
Tag tag = tagService.findOrCreate(tagName.trim());
story.addTag(tag);
} catch (Exception e) {
log.error("Error creating tag '{}': {}", tagName, e.getMessage(), e);
throw e;
}
});
} catch (Exception e) {
log.error("Error handling tags: {}", e.getMessage(), e);
throw e;
}
// Extract additional metadata for potential future use
extractAdditionalMetadata(metadata, story);
log.info("Story entity created successfully: {}", title);
return story;
}
@@ -244,7 +296,13 @@ public class EPUBImportService {
/**
 * Returns the first description listed in the EPUB metadata, shortened so the
 * result never exceeds 1000 characters.
 *
 * <p>A description longer than 1000 characters is cut and suffixed with
 * {@code "..."}; the cut is moved back by one char when it would otherwise
 * separate the two halves of a surrogate pair.
 *
 * @param metadata EPUB metadata to read the descriptions from
 * @return the (possibly truncated) first description, or {@code null} when the
 *         metadata has no descriptions or the first entry is {@code null}
 */
private String extractDescription(Metadata metadata) {
    final int maxLength = 1000;
    final String ellipsis = "...";

    List<String> descriptions = metadata.getDescriptions();
    if (descriptions == null || descriptions.isEmpty()) {
        return null;
    }

    String description = descriptions.get(0);
    if (description == null || description.length() <= maxLength) {
        return description;
    }

    log.info("Description exceeds 1000 characters ({}), truncating...", description.length());

    int cut = maxLength - ellipsis.length();
    // Never end the kept text on a dangling high surrogate: that would leave
    // an unpaired UTF-16 code unit in front of the ellipsis.
    if (Character.isHighSurrogate(description.charAt(cut - 1))) {
        cut--;
    }
    return description.substring(0, cut) + ellipsis;
}

View File

@@ -188,13 +188,13 @@ public class HtmlSanitizationService {
return "";
}
logger.info("Content before sanitization: "+html);
logger.debug("Sanitizing HTML content (length: {} characters)", html.length());
// Preprocess to extract images from figure tags
String preprocessed = preprocessFigureTags(html);
String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
logger.info("Content after sanitization: "+saniztedHtml);
logger.debug("Sanitization complete (output length: {} characters)", saniztedHtml.length());
return saniztedHtml;
}