scraping and improvements
@@ -65,10 +65,12 @@ public class AuthorController {

 @PostMapping
 public ResponseEntity<AuthorDto> createAuthor(@Valid @RequestBody CreateAuthorRequest request) {
+logger.info("Creating new author: {}", request.getName());
 Author author = new Author();
 updateAuthorFromRequest(author, request);

 Author savedAuthor = authorService.create(author);
+logger.info("Successfully created author: {} (ID: {})", savedAuthor.getName(), savedAuthor.getId());
 return ResponseEntity.status(HttpStatus.CREATED).body(convertToDto(savedAuthor));
 }

@@ -81,13 +83,7 @@ public class AuthorController {
 @RequestParam(required = false, name = "authorRating") Integer rating,
 @RequestParam(required = false, name = "avatar") MultipartFile avatarFile) {

-System.out.println("DEBUG: MULTIPART PUT called with:");
-System.out.println(" - name: " + name);
-System.out.println(" - notes: " + notes);
-System.out.println(" - urls: " + urls);
-System.out.println(" - rating: " + rating);
-System.out.println(" - avatar: " + (avatarFile != null ? avatarFile.getOriginalFilename() : "null"));
+logger.info("Updating author with multipart data (ID: {})", id);

 try {
 Author existingAuthor = authorService.findById(id);

@@ -104,7 +100,6 @@ public class AuthorController {

 // Handle rating update
 if (rating != null) {
-System.out.println("DEBUG: Setting author rating via PUT: " + rating);
 existingAuthor.setAuthorRating(rating);
 }

@@ -115,6 +110,7 @@ public class AuthorController {
 }

 Author updatedAuthor = authorService.update(id, existingAuthor);
+logger.info("Successfully updated author: {} via multipart", updatedAuthor.getName());
 return ResponseEntity.ok(convertToDto(updatedAuthor));

 } catch (Exception e) {

@@ -125,31 +121,27 @@ public class AuthorController {
 @PutMapping(value = "/{id}", consumes = "application/json")
 public ResponseEntity<AuthorDto> updateAuthorJson(@PathVariable UUID id,
 @Valid @RequestBody UpdateAuthorRequest request) {
-System.out.println("DEBUG: JSON PUT called with:");
-System.out.println(" - name: " + request.getName());
-System.out.println(" - notes: " + request.getNotes());
-System.out.println(" - urls: " + request.getUrls());
-System.out.println(" - rating: " + request.getRating());
+logger.info("Updating author with JSON data: {} (ID: {})", request.getName(), id);

 Author existingAuthor = authorService.findById(id);
 updateAuthorFromRequest(existingAuthor, request);

 Author updatedAuthor = authorService.update(id, existingAuthor);
+logger.info("Successfully updated author: {} via JSON", updatedAuthor.getName());
 return ResponseEntity.ok(convertToDto(updatedAuthor));
 }

 @PutMapping("/{id}")
 public ResponseEntity<String> updateAuthorGeneric(@PathVariable UUID id, HttpServletRequest request) {
-System.out.println("DEBUG: GENERIC PUT called!");
-System.out.println(" - Content-Type: " + request.getContentType());
-System.out.println(" - Method: " + request.getMethod());

 return ResponseEntity.status(415).body("Unsupported Media Type. Expected multipart/form-data or application/json");
 }

 @DeleteMapping("/{id}")
 public ResponseEntity<?> deleteAuthor(@PathVariable UUID id) {
+logger.info("Deleting author with ID: {}", id);
 authorService.delete(id);
+logger.info("Successfully deleted author with ID: {}", id);
 return ResponseEntity.ok(Map.of("message", "Author deleted successfully"));
 }

@@ -177,11 +169,8 @@ public class AuthorController {

 @PostMapping("/{id}/rating")
 public ResponseEntity<AuthorDto> rateAuthor(@PathVariable UUID id, @RequestBody RatingRequest request) {
-System.out.println("DEBUG: Rating author " + id + " with rating " + request.getRating());
 Author author = authorService.setRating(id, request.getRating());
-System.out.println("DEBUG: After setRating, author rating is: " + author.getAuthorRating());
 AuthorDto dto = convertToDto(author);
-System.out.println("DEBUG: Final DTO rating is: " + dto.getAuthorRating());
 return ResponseEntity.ok(dto);
 }

@@ -211,9 +200,7 @@ public class AuthorController {
 @PostMapping("/{id}/test-rating/{rating}")
 public ResponseEntity<Map<String, Object>> testSetRating(@PathVariable UUID id, @PathVariable Integer rating) {
 try {
-System.out.println("DEBUG: Test setting rating " + rating + " for author " + id);
 Author author = authorService.setRating(id, rating);
-System.out.println("DEBUG: After test setRating, got: " + author.getAuthorRating());

 return ResponseEntity.ok(Map.of(
 "success", true,

@@ -231,13 +218,11 @@ public class AuthorController {
 @PostMapping("/{id}/test-put-rating")
 public ResponseEntity<Map<String, Object>> testPutWithRating(@PathVariable UUID id, @RequestParam Integer rating) {
 try {
-System.out.println("DEBUG: Test PUT with rating " + rating + " for author " + id);

 Author existingAuthor = authorService.findById(id);
 existingAuthor.setAuthorRating(rating);
 Author updatedAuthor = authorService.update(id, existingAuthor);

-System.out.println("DEBUG: After PUT update, rating is: " + updatedAuthor.getAuthorRating());

 return ResponseEntity.ok(Map.of(
 "success", true,

@@ -389,7 +374,6 @@ public class AuthorController {
 author.setUrls(updateReq.getUrls());
 }
 if (updateReq.getRating() != null) {
-System.out.println("DEBUG: Setting author rating via JSON: " + updateReq.getRating());
 author.setAuthorRating(updateReq.getRating());
 }
 }

@@ -402,9 +386,6 @@ public class AuthorController {
 dto.setNotes(author.getNotes());
 dto.setAvatarImagePath(author.getAvatarImagePath());

-// Debug logging for author rating
-System.out.println("DEBUG: Converting author " + author.getName() +
-" with rating: " + author.getAuthorRating());
 dto.setAuthorRating(author.getAuthorRating());
 dto.setUrls(author.getUrls());

@@ -415,7 +396,6 @@ public class AuthorController {
 // Calculate and set average story rating
 dto.setAverageStoryRating(authorService.calculateAverageStoryRating(author.getId()));

-System.out.println("DEBUG: DTO authorRating set to: " + dto.getAuthorRating());

 return dto;
 }
@@ -56,8 +56,6 @@ public class CollectionController {
 @RequestParam(required = false) List<String> tags,
 @RequestParam(defaultValue = "false") boolean archived) {

-logger.info("COLLECTIONS: Search request - search='{}', tags={}, archived={}, page={}, limit={}",
-search, tags, archived, page, limit);

 // MANDATORY: Use Typesense for all search/filter operations
 SearchResultDto<Collection> results = collectionService.searchCollections(search, tags, archived, page, limit);

@@ -94,13 +92,14 @@ public class CollectionController {
 */
 @PostMapping
 public ResponseEntity<Collection> createCollection(@Valid @RequestBody CreateCollectionRequest request) {
+logger.info("Creating new collection: {}", request.getName());
 Collection collection = collectionService.createCollection(
 request.getName(),
 request.getDescription(),
 request.getTagNames(),
 request.getStoryIds()
 );
+logger.info("Successfully created collection: {} (ID: {})", collection.getName(), collection.getId());
 return ResponseEntity.status(HttpStatus.CREATED).body(collection);
 }

@@ -115,6 +114,7 @@ public class CollectionController {
 @RequestParam(required = false) List<UUID> storyIds,
 @RequestParam(required = false, name = "coverImage") MultipartFile coverImage) {

+logger.info("Creating new collection with image: {}", name);
 try {
 // Create collection first
 Collection collection = collectionService.createCollection(name, description, tags, storyIds);

@@ -128,6 +128,7 @@ public class CollectionController {
 );
 }

+logger.info("Successfully created collection with image: {} (ID: {})", collection.getName(), collection.getId());
 return ResponseEntity.status(HttpStatus.CREATED).body(collection);

 } catch (Exception e) {

@@ -160,7 +161,9 @@ public class CollectionController {
 */
 @DeleteMapping("/{id}")
 public ResponseEntity<Map<String, String>> deleteCollection(@PathVariable UUID id) {
+logger.info("Deleting collection with ID: {}", id);
 collectionService.deleteCollection(id);
+logger.info("Successfully deleted collection with ID: {}", id);
 return ResponseEntity.ok(Map.of("message", "Collection deleted successfully"));
 }

@@ -86,23 +86,29 @@ public class StoryController {

 @PostMapping
 public ResponseEntity<StoryDto> createStory(@Valid @RequestBody CreateStoryRequest request) {
+logger.info("Creating new story: {}", request.getTitle());
 Story story = new Story();
 updateStoryFromRequest(story, request);

 Story savedStory = storyService.createWithTagNames(story, request.getTagNames());
+logger.info("Successfully created story: {} (ID: {})", savedStory.getTitle(), savedStory.getId());
 return ResponseEntity.status(HttpStatus.CREATED).body(convertToDto(savedStory));
 }

 @PutMapping("/{id}")
 public ResponseEntity<StoryDto> updateStory(@PathVariable UUID id,
 @Valid @RequestBody UpdateStoryRequest request) {
+logger.info("Updating story: {} (ID: {})", request.getTitle(), id);
 Story updatedStory = storyService.updateWithTagNames(id, request);
+logger.info("Successfully updated story: {}", updatedStory.getTitle());
 return ResponseEntity.ok(convertToDto(updatedStory));
 }

 @DeleteMapping("/{id}")
 public ResponseEntity<?> deleteStory(@PathVariable UUID id) {
+logger.info("Deleting story with ID: {}", id);
 storyService.delete(id);
+logger.info("Successfully deleted story with ID: {}", id);
 return ResponseEntity.ok(Map.of("message", "Story deleted successfully"));
 }

@@ -212,7 +218,6 @@ public class StoryController {
 @RequestParam(required = false) String sortBy,
 @RequestParam(required = false) String sortDir) {

-logger.info("CONTROLLER DEBUG: Search request - query='{}', tags={}, authors={}", query, tags, authors);

 if (typesenseService != null) {
 SearchResultDto<StorySearchDto> results = typesenseService.searchStories(
@@ -31,7 +31,7 @@ public class AuthorService {
 private final TypesenseService typesenseService;

 @Autowired
-public AuthorService(AuthorRepository authorRepository, TypesenseService typesenseService) {
+public AuthorService(AuthorRepository authorRepository, @Autowired(required = false) TypesenseService typesenseService) {
 this.authorRepository = authorRepository;
 this.typesenseService = typesenseService;
 }
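The change above makes the Typesense client an optional collaborator: the service still has to be constructible without a search backend, so every indexing call after it must tolerate a null reference. A minimal sketch of that pattern, using the entity and service types that appear in this diff but with an illustrative class name that is not part of the commit:

    // Sketch only: a service whose search-index client may be absent (null).
    public class GuardedIndexingService {
        private final AuthorRepository repository;   // persistence is mandatory
        private final TypesenseService searchClient; // may be null when search is disabled

        public GuardedIndexingService(AuthorRepository repository, TypesenseService searchClient) {
            this.repository = repository;
            this.searchClient = searchClient;        // no non-null requirement
        }

        public Author save(Author author) {
            Author saved = repository.save(author);
            if (searchClient != null) {              // indexing is best-effort
                try {
                    searchClient.indexAuthor(saved);
                } catch (Exception e) {
                    // log and continue: a search outage must not fail the write
                }
            }
            return saved;
        }
    }

This is also what allows the unit tests later in the commit to construct the service directly with a null Typesense argument instead of relying on @InjectMocks.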
@@ -133,10 +133,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(author);

 // Index in Typesense
-try {
-typesenseService.indexAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to index author in Typesense: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.indexAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to index author in Typesense: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;

@@ -155,10 +157,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(existingAuthor);

 // Update in Typesense
-try {
-typesenseService.updateAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;

@@ -175,10 +179,12 @@ public class AuthorService {
 authorRepository.delete(author);

 // Remove from Typesense
-try {
-typesenseService.deleteAuthor(id.toString());
-} catch (Exception e) {
-logger.warn("Failed to delete author from Typesense: " + author.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.deleteAuthor(id.toString());
+} catch (Exception e) {
+logger.warn("Failed to delete author from Typesense: " + author.getName(), e);
+}
 }
 }

@@ -188,10 +194,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(author);

 // Update in Typesense
-try {
-typesenseService.updateAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense after adding URL: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense after adding URL: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;

@@ -203,10 +211,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(author);

 // Update in Typesense
-try {
-typesenseService.updateAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense after removing URL: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense after removing URL: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;

@@ -242,10 +252,12 @@ public class AuthorService {
 refreshedAuthor.getAuthorRating(), refreshedAuthor.getName());

 // Update in Typesense
-try {
-typesenseService.updateAuthor(refreshedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense after rating: " + refreshedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(refreshedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense after rating: " + refreshedAuthor.getName(), e);
+}
 }

 return refreshedAuthor;

@@ -290,10 +302,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(author);

 // Update in Typesense
-try {
-typesenseService.updateAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense after setting avatar: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense after setting avatar: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;

@@ -305,10 +319,12 @@ public class AuthorService {
 Author savedAuthor = authorRepository.save(author);

 // Update in Typesense
-try {
-typesenseService.updateAuthor(savedAuthor);
-} catch (Exception e) {
-logger.warn("Failed to update author in Typesense after removing avatar: " + savedAuthor.getName(), e);
+if (typesenseService != null) {
+try {
+typesenseService.updateAuthor(savedAuthor);
+} catch (Exception e) {
+logger.warn("Failed to update author in Typesense after removing avatar: " + savedAuthor.getName(), e);
+}
 }

 return savedAuthor;
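Each of the hunks above repeats the same null-check plus warn-and-continue wrapper around a single Typesense call. One way that guard could be written once is a small in-class helper; this is a hypothetical sketch that relies on the class's existing typesenseService and logger fields and is not something this commit adds:

    // Hypothetical helper for AuthorService; not part of the commit.
    private void syncToIndex(String action, Runnable indexCall) {
        if (typesenseService == null) {
            return;                                  // search indexing is optional
        }
        try {
            indexCall.run();
        } catch (Exception e) {
            logger.warn("Failed to " + action + " in Typesense", e);
        }
    }

    // Example call site replacing one of the blocks above:
    // syncToIndex("update author " + savedAuthor.getName(),
    //             () -> typesenseService.updateAuthor(savedAuthor));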
@@ -209,8 +209,6 @@ public class TypesenseService {
 try {
 long startTime = System.currentTimeMillis();

-logger.info("SEARCH DEBUG: searchStories called with query='{}', tagFilters={}, authorFilters={}",
-query, tagFilters, authorFilters);

 // Convert 0-based page (frontend/backend) to 1-based page (Typesense)
 int typesensePage = page + 1;

@@ -242,15 +240,12 @@ public class TypesenseService {
 }

 if (tagFilters != null && !tagFilters.isEmpty()) {
-logger.info("SEARCH DEBUG: Processing {} tag filters: {}", tagFilters.size(), tagFilters);
 // Use AND logic for multiple tags - items must have ALL selected tags
 for (String tag : tagFilters) {
 String escaped = escapeTypesenseValue(tag);
 String condition = "tagNames:=" + escaped;
-logger.info("SEARCH DEBUG: Tag '{}' -> escaped '{}' -> condition '{}'", tag, escaped, condition);
 filterConditions.add(condition);
 }
-logger.info("SEARCH DEBUG: Added {} individual tag filter conditions", tagFilters.size());
 }

 if (minRating != null) {

@@ -263,17 +258,14 @@ public class TypesenseService {

 if (!filterConditions.isEmpty()) {
 String finalFilter = String.join(" && ", filterConditions);
-logger.info("SEARCH DEBUG: Final filter condition: '{}'", finalFilter);
 searchParameters.filterBy(finalFilter);
 } else {
-logger.info("SEARCH DEBUG: No filter conditions applied");
 }

 SearchResult searchResult = typesenseClient.collections(STORIES_COLLECTION)
 .documents()
 .search(searchParameters);

-logger.info("SEARCH DEBUG: Typesense returned {} results", searchResult.getFound());

 List<StorySearchDto> results = convertSearchResult(searchResult);
 long searchTime = System.currentTimeMillis() - startTime;

@@ -377,10 +369,8 @@ public class TypesenseService {
 List<String> tagNames = story.getTags().stream()
 .map(tag -> tag.getName())
 .collect(Collectors.toList());
-logger.debug("INDEXING DEBUG: Story '{}' has tags: {}", story.getTitle(), tagNames);
 document.put("tagNames", tagNames);
 } else {
-logger.debug("INDEXING DEBUG: Story '{}' has no tags", story.getTitle());
 }

 document.put("rating", story.getRating() != null ? story.getRating() : 0);

@@ -746,8 +736,6 @@ public class TypesenseService {

 public SearchResultDto<AuthorSearchDto> searchAuthors(String query, int page, int perPage, String sortBy, String sortOrder) {
 try {
-logger.info("AUTHORS SEARCH DEBUG: Searching collection '{}' with query='{}', sortBy='{}', sortOrder='{}'",
-AUTHORS_COLLECTION, query, sortBy, sortOrder);
 SearchParameters searchParameters = new SearchParameters()
 .q(query != null && !query.trim().isEmpty() ? query : "*")
 .queryBy("name,notes")

@@ -759,8 +747,6 @@ public class TypesenseService {
 String sortDirection = "desc".equalsIgnoreCase(sortOrder) ? "desc" : "asc";
 String sortField = mapAuthorSortField(sortBy);
 String sortString = sortField + ":" + sortDirection;
-logger.info("AUTHORS SEARCH DEBUG: Original sortBy='{}', mapped to='{}', full sort string='{}'",
-sortBy, sortField, sortString);
 searchParameters.sortBy(sortString);
 }

@@ -771,17 +757,12 @@ public class TypesenseService {
 .search(searchParameters);
 } catch (Exception sortException) {
 // If sorting fails (likely due to schema issues), retry without sorting
-logger.error("SORTING ERROR DEBUG: Full exception details", sortException);
 logger.warn("Sorting failed for authors search, retrying without sort: " + sortException.getMessage());

 // Try to get collection info for debugging
 try {
 CollectionResponse collection = typesenseClient.collections(AUTHORS_COLLECTION).retrieve();
-logger.error("COLLECTION DEBUG: Collection '{}' exists with {} documents and {} fields",
-collection.getName(), collection.getNumDocuments(), collection.getFields().size());
-logger.error("COLLECTION DEBUG: Fields: {}", collection.getFields());
 } catch (Exception debugException) {
-logger.error("COLLECTION DEBUG: Failed to retrieve collection info", debugException);
 }

 searchParameters = new SearchParameters()
@@ -1,6 +1,7 @@
 package com.storycove.service;

 import com.storycove.entity.Author;
+import com.storycove.entity.Story;
 import com.storycove.repository.AuthorRepository;
 import com.storycove.service.exception.DuplicateResourceException;
 import com.storycove.service.exception.ResourceNotFoundException;

@@ -24,6 +25,7 @@ import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyString;
 import static org.mockito.Mockito.*;
+import static org.mockito.Mockito.times;

 @ExtendWith(MockitoExtension.class)
 @DisplayName("Author Service Unit Tests")

@@ -32,7 +34,6 @@ class AuthorServiceTest {
 @Mock
 private AuthorRepository authorRepository;

-@InjectMocks
 private AuthorService authorService;

 private Author testAuthor;

@@ -44,6 +45,9 @@ class AuthorServiceTest {
 testAuthor = new Author("Test Author");
 testAuthor.setId(testId);
 testAuthor.setNotes("Test notes");
+
+// Initialize service with null TypesenseService (which is allowed)
+authorService = new AuthorService(authorRepository, null);
 }

 @Test

@@ -307,4 +311,133 @@ class AuthorServiceTest {
 assertEquals(5L, count);
 verify(authorRepository).countRecentAuthors(any(java.time.LocalDateTime.class));
 }
+
+@Test
+@DisplayName("Should set author rating with validation")
+void shouldSetAuthorRating() {
+when(authorRepository.findById(testId)).thenReturn(Optional.of(testAuthor));
+when(authorRepository.save(any(Author.class))).thenReturn(testAuthor);
+
+Author result = authorService.setRating(testId, 4);
+
+assertEquals(4, testAuthor.getAuthorRating());
+verify(authorRepository, times(2)).findById(testId); // Called twice: once initially, once after flush
+verify(authorRepository).save(testAuthor);
+verify(authorRepository).flush();
+}
+
+@Test
+@DisplayName("Should throw exception for invalid rating range")
+void shouldThrowExceptionForInvalidRating() {
+assertThrows(IllegalArgumentException.class, () -> authorService.setRating(testId, 0));
+assertThrows(IllegalArgumentException.class, () -> authorService.setRating(testId, 6));
+
+verify(authorRepository, never()).findById(any());
+verify(authorRepository, never()).save(any());
+}
+
+@Test
+@DisplayName("Should handle null rating")
+void shouldHandleNullRating() {
+when(authorRepository.findById(testId)).thenReturn(Optional.of(testAuthor));
+when(authorRepository.save(any(Author.class))).thenReturn(testAuthor);
+
+Author result = authorService.setRating(testId, null);
+
+assertNull(testAuthor.getAuthorRating());
+verify(authorRepository, times(2)).findById(testId); // Called twice: once initially, once after flush
+verify(authorRepository).save(testAuthor);
+}
+
+@Test
+@DisplayName("Should find all authors with stories")
+void shouldFindAllAuthorsWithStories() {
+List<Author> authors = List.of(testAuthor);
+when(authorRepository.findAll()).thenReturn(authors);
+
+List<Author> result = authorService.findAllWithStories();
+
+assertEquals(1, result.size());
+verify(authorRepository).findAll();
+}
+
+@Test
+@DisplayName("Should get author rating from database")
+void shouldGetAuthorRatingFromDb() {
+when(authorRepository.findAuthorRatingById(testId)).thenReturn(4);
+
+Integer rating = authorService.getAuthorRatingFromDb(testId);
+
+assertEquals(4, rating);
+verify(authorRepository).findAuthorRatingById(testId);
+}
+
+@Test
+@DisplayName("Should calculate average story rating")
+void shouldCalculateAverageStoryRating() {
+// Setup test author with stories
+Story story1 = new Story("Story 1");
+story1.setRating(4);
+Story story2 = new Story("Story 2");
+story2.setRating(5);
+
+testAuthor.getStories().add(story1);
+testAuthor.getStories().add(story2);
+
+when(authorRepository.findById(testId)).thenReturn(Optional.of(testAuthor));
+
+Double avgRating = authorService.calculateAverageStoryRating(testId);
+
+assertEquals(4.5, avgRating);
+verify(authorRepository).findById(testId);
+}
+
+@Test
+@DisplayName("Should find authors with stories using repository method")
+void shouldFindAuthorsWithStoriesFromRepository() {
+List<Author> authors = List.of(testAuthor);
+when(authorRepository.findAuthorsWithStories()).thenReturn(authors);
+
+List<Author> result = authorService.findAuthorsWithStories();
+
+assertEquals(1, result.size());
+verify(authorRepository).findAuthorsWithStories();
+}
+
+@Test
+@DisplayName("Should find top rated authors")
+void shouldFindTopRatedAuthors() {
+List<Author> authors = List.of(testAuthor);
+when(authorRepository.findTopRatedAuthors()).thenReturn(authors);
+
+List<Author> result = authorService.findTopRatedAuthors();
+
+assertEquals(1, result.size());
+verify(authorRepository).findTopRatedAuthors();
+}
+
+@Test
+@DisplayName("Should find most prolific authors")
+void shouldFindMostProlificAuthors() {
+List<Author> authors = List.of(testAuthor);
+when(authorRepository.findMostProlificAuthors()).thenReturn(authors);
+
+List<Author> result = authorService.findMostProlificAuthors();
+
+assertEquals(1, result.size());
+verify(authorRepository).findMostProlificAuthors();
+}
+
+@Test
+@DisplayName("Should find authors by URL domain")
+void shouldFindAuthorsByUrlDomain() {
+List<Author> authors = List.of(testAuthor);
+when(authorRepository.findByUrlDomain("example.com")).thenReturn(authors);
+
+List<Author> result = authorService.findByUrlDomain("example.com");
+
+assertEquals(1, result.size());
+verify(authorRepository).findByUrlDomain("example.com");
+}
+
 }
@@ -1,12 +1,19 @@
 /** @type {import('next').NextConfig} */
 const nextConfig = {
-async rewrites() {
-return [
-{
-source: '/api/:path*',
-destination: 'http://backend:8080/api/:path*',
-},
-];
+// Removed Next.js rewrites since nginx handles all API routing
+webpack: (config, { isServer }) => {
+// Exclude cheerio and its dependencies from client-side bundling
+if (!isServer) {
+config.resolve.fallback = {
+...config.resolve.fallback,
+fs: false,
+net: false,
+tls: false,
+'undici': false,
+};
+config.externals.push('cheerio', 'server-only');
+}
+return config;
 },
 images: {
 domains: ['localhost'],
frontend/package-lock.json (generated, 225 lines changed)
@@ -8,14 +8,17 @@
 "name": "storycove-frontend",
 "version": "0.1.0",
 "dependencies": {
+"@heroicons/react": "^2.2.0",
 "autoprefixer": "^10.4.16",
 "axios": "^1.6.0",
+"cheerio": "^1.0.0-rc.12",
 "dompurify": "^3.0.5",
 "next": "14.0.0",
 "postcss": "^8.4.31",
 "react": "^18",
 "react-dom": "^18",
 "react-dropzone": "^14.2.3",
+"server-only": "^0.0.1",
 "tailwindcss": "^3.3.0"
 },
 "devDependencies": {
@@ -10,23 +10,26 @@
 "type-check": "tsc --noEmit"
 },
 "dependencies": {
+"@heroicons/react": "^2.2.0",
+"autoprefixer": "^10.4.16",
+"axios": "^1.6.0",
+"cheerio": "^1.0.0-rc.12",
+"dompurify": "^3.0.5",
 "next": "14.0.0",
+"postcss": "^8.4.31",
 "react": "^18",
 "react-dom": "^18",
-"axios": "^1.6.0",
-"dompurify": "^3.0.5",
 "react-dropzone": "^14.2.3",
-"tailwindcss": "^3.3.0",
-"autoprefixer": "^10.4.16",
-"postcss": "^8.4.31"
+"server-only": "^0.0.1",
+"tailwindcss": "^3.3.0"
 },
 "devDependencies": {
-"typescript": "^5",
+"@types/dompurify": "^3.0.5",
 "@types/node": "^20",
 "@types/react": "^18",
 "@types/react-dom": "^18",
-"@types/dompurify": "^3.0.5",
 "eslint": "^8",
-"eslint-config-next": "14.0.0"
+"eslint-config-next": "14.0.0",
+"typescript": "^5"
 }
 }
@@ -12,6 +12,9 @@ import ImageUpload from '../../components/ui/ImageUpload';
|
|||||||
import { storyApi, authorApi } from '../../lib/api';
|
import { storyApi, authorApi } from '../../lib/api';
|
||||||
|
|
||||||
export default function AddStoryPage() {
|
export default function AddStoryPage() {
|
||||||
|
const [importMode, setImportMode] = useState<'manual' | 'url'>('manual');
|
||||||
|
const [importUrl, setImportUrl] = useState('');
|
||||||
|
const [scraping, setScraping] = useState(false);
|
||||||
const [formData, setFormData] = useState({
|
const [formData, setFormData] = useState({
|
||||||
title: '',
|
title: '',
|
||||||
summary: '',
|
summary: '',
|
||||||
@@ -130,6 +133,57 @@ export default function AddStoryPage() {
    setFormData(prev => ({ ...prev, tags }));
  };

+  const handleImportFromUrl = async () => {
+    if (!importUrl.trim()) {
+      setErrors({ importUrl: 'URL is required' });
+      return;
+    }
+
+    setScraping(true);
+    setErrors({});
+
+    try {
+      const response = await fetch('/scrape/story', {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({ url: importUrl }),
+      });
+
+      if (!response.ok) {
+        const errorData = await response.json();
+        throw new Error(errorData.error || 'Failed to scrape story');
+      }
+
+      const scrapedStory = await response.json();
+
+      // Pre-fill the form with scraped data
+      setFormData({
+        title: scrapedStory.title || '',
+        summary: scrapedStory.summary || '',
+        authorName: scrapedStory.author || '',
+        contentHtml: scrapedStory.content || '',
+        sourceUrl: scrapedStory.sourceUrl || importUrl,
+        tags: scrapedStory.tags || [],
+        seriesName: '',
+        volume: '',
+      });
+
+      // Switch to manual mode so user can edit the pre-filled data
+      setImportMode('manual');
+      setImportUrl('');
+
+      // Show success message
+      setErrors({ success: 'Story data imported successfully! Review and edit as needed before saving.' });
+    } catch (error: any) {
+      console.error('Failed to import story:', error);
+      setErrors({ importUrl: error.message });
+    } finally {
+      setScraping(false);
+    }
+  };
+
  const validateForm = () => {
    const newErrors: Record<string, string> = {};

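Note: handleImportFromUrl above posts to the new /scrape/story route handler added later in this commit. A rough sketch of the exchange; the caller below is illustrative, and the field names mirror what the route returns:

// hypothetical caller, for illustration only
async function importOne(url: string) {
  const res = await fetch('/scrape/story', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url }),
  });
  if (!res.ok) {
    // error responses carry { error } with status 400/404/408/500
    throw new Error((await res.json()).error);
  }
  // success responses carry title, author, content, sourceUrl and optionally summary, tags, coverImage
  return res.json();
}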
@@ -206,7 +260,105 @@ export default function AddStoryPage() {
          </p>
        </div>

-        <form onSubmit={handleSubmit} className="space-y-6">
+        {/* Import Mode Toggle */}
+        <div className="mb-8">
+          <div className="flex border-b border-gray-200 dark:border-gray-700">
+            <button
+              type="button"
+              onClick={() => setImportMode('manual')}
+              className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
+                importMode === 'manual'
+                  ? 'border-theme-accent text-theme-accent'
+                  : 'border-transparent theme-text hover:text-theme-accent'
+              }`}
+            >
+              Manual Entry
+            </button>
+            <button
+              type="button"
+              onClick={() => setImportMode('url')}
+              className={`px-6 py-3 text-sm font-medium border-b-2 transition-colors ${
+                importMode === 'url'
+                  ? 'border-theme-accent text-theme-accent'
+                  : 'border-transparent theme-text hover:text-theme-accent'
+              }`}
+            >
+              Import from URL
+            </button>
+          </div>
+        </div>
+
+        {/* URL Import Section */}
+        {importMode === 'url' && (
+          <div className="bg-gray-50 dark:bg-gray-800/50 rounded-lg p-6 mb-8">
+            <h3 className="text-lg font-medium theme-header mb-4">Import Story from URL</h3>
+            <p className="theme-text text-sm mb-4">
+              Enter a URL from a supported story site to automatically extract the story content, title, author, and other metadata.
+            </p>
+
+            <div className="space-y-4">
+              <Input
+                label="Story URL"
+                type="url"
+                value={importUrl}
+                onChange={(e) => setImportUrl(e.target.value)}
+                placeholder="https://example.com/story-url"
+                error={errors.importUrl}
+                disabled={scraping}
+              />
+
+              <div className="flex gap-3">
+                <Button
+                  type="button"
+                  onClick={handleImportFromUrl}
+                  loading={scraping}
+                  disabled={!importUrl.trim() || scraping}
+                >
+                  {scraping ? 'Importing...' : 'Import Story'}
+                </Button>
+
+                <Button
+                  type="button"
+                  variant="ghost"
+                  onClick={() => setImportMode('manual')}
+                  disabled={scraping}
+                >
+                  Enter Manually Instead
+                </Button>
+              </div>
+
+              <div className="border-t pt-4 mt-4">
+                <p className="text-sm theme-text mb-2">
+                  Need to import multiple stories at once?
+                </p>
+                <Button
+                  type="button"
+                  variant="secondary"
+                  onClick={() => router.push('/stories/import/bulk')}
+                  disabled={scraping}
+                  size="sm"
+                >
+                  Bulk Import Multiple URLs
+                </Button>
+              </div>
+
+              <div className="text-xs theme-text">
+                <p className="font-medium mb-1">Supported Sites:</p>
+                <p>Archive of Our Own, DeviantArt, FanFiction.Net, Literotica, Royal Road, Wattpad, and more</p>
+              </div>
+            </div>
+          </div>
+        )}
+
+        {/* Success Message */}
+        {errors.success && (
+          <div className="p-4 bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg mb-6">
+            <p className="text-green-800 dark:text-green-200">{errors.success}</p>
+          </div>
+        )}
+
+        {importMode === 'manual' && (
+          <form onSubmit={handleSubmit} className="space-y-6">
          {/* Title */}
          <Input
            label="Title *"

@@ -379,6 +531,7 @@ export default function AddStoryPage() {
            </Button>
          </div>
        </form>
+        )}
      </div>
    </AppLayout>
  );
frontend/src/app/scrape/author/route.ts (new file, 72 lines)
@@ -0,0 +1,72 @@
import { NextRequest, NextResponse } from 'next/server';

export async function POST(request: NextRequest) {
  try {
    const body = await request.json();
    const { url } = body;

    if (!url || typeof url !== 'string') {
      return NextResponse.json(
        { error: 'URL is required and must be a string' },
        { status: 400 }
      );
    }

    // Dynamic import to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');

    const scraper = new StoryScraper();
    const stories = await scraper.scrapeAuthorPage(url);

    return NextResponse.json({ stories });
  } catch (error) {
    console.error('Author page scraping error:', error);

    // Check if it's a ScraperError without importing at module level
    if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
      return NextResponse.json(
        {
          error: (error as any).message,
          url: (error as any).url
        },
        { status: 400 }
      );
    }

    if (error instanceof Error) {
      // Handle specific error types
      if (error.message.includes('Invalid URL')) {
        return NextResponse.json(
          { error: 'Invalid URL provided' },
          { status: 400 }
        );
      }

      if (error.message.includes('not supported')) {
        return NextResponse.json(
          { error: 'Author page scraping is not supported for this website' },
          { status: 400 }
        );
      }

      if (error.message.includes('HTTP 404')) {
        return NextResponse.json(
          { error: 'Author page not found at the provided URL' },
          { status: 404 }
        );
      }

      if (error.message.includes('timeout')) {
        return NextResponse.json(
          { error: 'Request timed out while fetching content' },
          { status: 408 }
        );
      }
    }

    return NextResponse.json(
      { error: 'Failed to scrape author page. Please try again.' },
      { status: 500 }
    );
  }
}
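Note: a companion sketch of how a client might call this author-page route. The response shape ({ stories: [...] }) follows the handler above; the caller itself is illustrative and not part of this commit:

// hypothetical caller, for illustration only
async function listAuthorStories(authorUrl: string) {
  const res = await fetch('/scrape/author', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url: authorUrl }),
  });
  if (!res.ok) throw new Error((await res.json()).error);
  const { stories } = await res.json();
  // each entry carries url, title, author and an optional summary
  return stories;
}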
frontend/src/app/scrape/bulk/route.ts (new file, 292 lines)
@@ -0,0 +1,292 @@
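Note: the bulk route below expects an authenticated POST with up to 50 URLs, scrapes each one, skips duplicates, and reports a per-URL result plus a summary. A sketch of that contract as defined by this file's BulkImportRequest/BulkImportResponse interfaces; the caller and token handling are illustrative:

// hypothetical caller, for illustration only
async function bulkImport(urls: string[], token: string) {
  const res = await fetch('/scrape/bulk', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${token}`,   // the route rejects requests without an Authorization header
    },
    body: JSON.stringify({ urls }),          // at most 50 URLs per request
  });
  if (!res.ok) throw new Error((await res.json()).error);
  // shape: { results: [{ url, status: 'imported' | 'skipped' | 'error', ... }],
  //          summary: { total, imported, skipped, errors } }
  return res.json();
}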
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
interface BulkImportRequest {
|
||||||
|
urls: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ImportResult {
|
||||||
|
url: string;
|
||||||
|
status: 'imported' | 'skipped' | 'error';
|
||||||
|
reason?: string;
|
||||||
|
title?: string;
|
||||||
|
author?: string;
|
||||||
|
error?: string;
|
||||||
|
storyId?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface BulkImportResponse {
|
||||||
|
results: ImportResult[];
|
||||||
|
summary: {
|
||||||
|
total: number;
|
||||||
|
imported: number;
|
||||||
|
skipped: number;
|
||||||
|
errors: number;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function POST(request: NextRequest) {
|
||||||
|
try {
|
||||||
|
// Check for authentication
|
||||||
|
const authorization = request.headers.get('authorization');
|
||||||
|
if (!authorization) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Authentication required for bulk import' },
|
||||||
|
{ status: 401 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const body = await request.json();
|
||||||
|
const { urls } = body as BulkImportRequest;
|
||||||
|
|
||||||
|
if (!urls || !Array.isArray(urls) || urls.length === 0) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'URLs array is required and must not be empty' },
|
||||||
|
{ status: 400 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (urls.length > 50) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Maximum 50 URLs allowed per bulk import' },
|
||||||
|
{ status: 400 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic imports to prevent client-side bundling
|
||||||
|
const { StoryScraper } = await import('@/lib/scraper/scraper');
|
||||||
|
|
||||||
|
const scraper = new StoryScraper();
|
||||||
|
const results: ImportResult[] = [];
|
||||||
|
let importedCount = 0;
|
||||||
|
let skippedCount = 0;
|
||||||
|
let errorCount = 0;
|
||||||
|
|
||||||
|
console.log(`Starting bulk scraping for ${urls.length} URLs`);
|
||||||
|
console.log(`Environment NEXT_PUBLIC_API_URL: ${process.env.NEXT_PUBLIC_API_URL}`);
|
||||||
|
|
||||||
|
// For server-side API calls in Docker, use direct backend container URL
|
||||||
|
// Client-side calls use NEXT_PUBLIC_API_URL through nginx, but server-side needs direct container access
|
||||||
|
const serverSideApiBaseUrl = 'http://backend:8080/api';
|
||||||
|
console.log(`DEBUG: serverSideApiBaseUrl variable is: ${serverSideApiBaseUrl}`);
|
||||||
|
|
||||||
|
// Quick test to verify backend connectivity
|
||||||
|
try {
|
||||||
|
console.log(`Testing backend connectivity at: http://backend:8080/api/stories/check-duplicate`);
|
||||||
|
const testResponse = await fetch(`http://backend:8080/api/stories/check-duplicate?title=test&authorName=test`, {
|
||||||
|
method: 'GET',
|
||||||
|
headers: {
|
||||||
|
'Authorization': authorization,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
console.log(`Backend test response status: ${testResponse.status}`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Backend connectivity test failed:`, error);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const url of urls) {
|
||||||
|
console.log(`Processing URL: ${url}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Validate URL format
|
||||||
|
if (!url || typeof url !== 'string' || url.trim() === '') {
|
||||||
|
results.push({
|
||||||
|
url: url || 'Empty URL',
|
||||||
|
status: 'error',
|
||||||
|
error: 'Invalid URL format'
|
||||||
|
});
|
||||||
|
errorCount++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const trimmedUrl = url.trim();
|
||||||
|
|
||||||
|
// Scrape the story
|
||||||
|
const scrapedStory = await scraper.scrapeStory(trimmedUrl);
|
||||||
|
|
||||||
|
// Validate required fields
|
||||||
|
if (!scrapedStory.title || !scrapedStory.author || !scrapedStory.content) {
|
||||||
|
const missingFields = [];
|
||||||
|
if (!scrapedStory.title) missingFields.push('title');
|
||||||
|
if (!scrapedStory.author) missingFields.push('author');
|
||||||
|
if (!scrapedStory.content) missingFields.push('content');
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
url: trimmedUrl,
|
||||||
|
status: 'skipped',
|
||||||
|
reason: `Missing required fields: ${missingFields.join(', ')}`,
|
||||||
|
title: scrapedStory.title,
|
||||||
|
author: scrapedStory.author
|
||||||
|
});
|
||||||
|
skippedCount++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for duplicates using query parameters
|
||||||
|
try {
|
||||||
|
// Use hardcoded backend URL for container-to-container communication
|
||||||
|
const duplicateCheckUrl = `http://backend:8080/api/stories/check-duplicate`;
|
||||||
|
console.log(`Duplicate check URL: ${duplicateCheckUrl}`);
|
||||||
|
const params = new URLSearchParams({
|
||||||
|
title: scrapedStory.title,
|
||||||
|
authorName: scrapedStory.author
|
||||||
|
});
|
||||||
|
|
||||||
|
const duplicateCheckResponse = await fetch(`${duplicateCheckUrl}?${params.toString()}`, {
|
||||||
|
method: 'GET',
|
||||||
|
headers: {
|
||||||
|
'Authorization': authorization,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (duplicateCheckResponse.ok) {
|
||||||
|
const duplicateResult = await duplicateCheckResponse.json();
|
||||||
|
if (duplicateResult.hasDuplicates) {
|
||||||
|
results.push({
|
||||||
|
url: trimmedUrl,
|
||||||
|
status: 'skipped',
|
||||||
|
reason: `Duplicate story found (${duplicateResult.count} existing)`,
|
||||||
|
title: scrapedStory.title,
|
||||||
|
author: scrapedStory.author
|
||||||
|
});
|
||||||
|
skippedCount++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn('Duplicate check failed:', error);
|
||||||
|
// Continue with import if duplicate check fails
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the story
|
||||||
|
try {
|
||||||
|
const storyData = {
|
||||||
|
title: scrapedStory.title,
|
||||||
|
summary: scrapedStory.summary || undefined,
|
||||||
|
contentHtml: scrapedStory.content,
|
||||||
|
sourceUrl: scrapedStory.sourceUrl || trimmedUrl,
|
||||||
|
authorName: scrapedStory.author,
|
||||||
|
tagNames: scrapedStory.tags && scrapedStory.tags.length > 0 ? scrapedStory.tags : undefined,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Use hardcoded backend URL for container-to-container communication
|
||||||
|
const createUrl = `http://backend:8080/api/stories`;
|
||||||
|
console.log(`Create story URL: ${createUrl}`);
|
||||||
|
const createResponse = await fetch(createUrl, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Authorization': authorization,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
body: JSON.stringify(storyData),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!createResponse.ok) {
|
||||||
|
const errorData = await createResponse.json();
|
||||||
|
throw new Error(errorData.message || 'Failed to create story');
|
||||||
|
}
|
||||||
|
|
||||||
|
const createdStory = await createResponse.json();
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
url: trimmedUrl,
|
||||||
|
status: 'imported',
|
||||||
|
title: scrapedStory.title,
|
||||||
|
author: scrapedStory.author,
|
||||||
|
storyId: createdStory.id
|
||||||
|
});
|
||||||
|
importedCount++;
|
||||||
|
|
||||||
|
console.log(`Successfully imported: ${scrapedStory.title} by ${scrapedStory.author} (ID: ${createdStory.id})`);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Failed to create story for ${trimmedUrl}:`, error);
|
||||||
|
|
||||||
|
let errorMessage = 'Failed to create story';
|
||||||
|
if (error instanceof Error) {
|
||||||
|
errorMessage = error.message;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
url: trimmedUrl,
|
||||||
|
status: 'error',
|
||||||
|
error: errorMessage,
|
||||||
|
title: scrapedStory.title,
|
||||||
|
author: scrapedStory.author
|
||||||
|
});
|
||||||
|
errorCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error processing URL ${url}:`, error);
|
||||||
|
|
||||||
|
let errorMessage = 'Unknown error';
|
||||||
|
if (error instanceof Error) {
|
||||||
|
errorMessage = error.message;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
url: url,
|
||||||
|
status: 'error',
|
||||||
|
error: errorMessage
|
||||||
|
});
|
||||||
|
errorCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const response: BulkImportResponse = {
|
||||||
|
results,
|
||||||
|
summary: {
|
||||||
|
total: urls.length,
|
||||||
|
imported: importedCount,
|
||||||
|
skipped: skippedCount,
|
||||||
|
errors: errorCount
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log(`Bulk import completed:`, response.summary);
|
||||||
|
|
||||||
|
// Trigger Typesense reindex if any stories were imported
|
||||||
|
if (importedCount > 0) {
|
||||||
|
try {
|
||||||
|
console.log('Triggering Typesense reindex after bulk import...');
|
||||||
|
const reindexUrl = `http://backend:8080/api/stories/reindex-typesense`;
|
||||||
|
const reindexResponse = await fetch(reindexUrl, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Authorization': authorization,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (reindexResponse.ok) {
|
||||||
|
const reindexResult = await reindexResponse.json();
|
||||||
|
console.log('Typesense reindex completed:', reindexResult);
|
||||||
|
} else {
|
||||||
|
console.warn('Typesense reindex failed:', reindexResponse.status);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn('Failed to trigger Typesense reindex:', error);
|
||||||
|
// Don't fail the whole request if reindex fails
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NextResponse.json(response);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Bulk import error:', error);
|
||||||
|
|
||||||
|
if (error instanceof Error) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: `Bulk import failed: ${error.message}` },
|
||||||
|
{ status: 500 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Bulk import failed due to an unknown error' },
|
||||||
|
{ status: 500 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
frontend/src/app/scrape/story/route.ts (new file, 85 lines)
@@ -0,0 +1,85 @@
import { NextRequest, NextResponse } from 'next/server';

export async function POST(request: NextRequest) {
  try {
    const body = await request.json();
    const { url } = body;

    if (!url || typeof url !== 'string') {
      return NextResponse.json(
        { error: 'URL is required and must be a string' },
        { status: 400 }
      );
    }

    // Dynamic import to prevent client-side bundling
    const { StoryScraper } = await import('@/lib/scraper/scraper');
    const { ScraperError } = await import('@/lib/scraper/types');

    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);

    // Debug logging
    console.log('Scraped story data:', {
      url: url,
      title: story.title,
      author: story.author,
      summary: story.summary,
      contentLength: story.content?.length || 0,
      contentPreview: story.content?.substring(0, 200) + '...',
      tags: story.tags,
      coverImage: story.coverImage
    });

    return NextResponse.json(story);
  } catch (error) {
    console.error('Story scraping error:', error);

    // Check if it's a ScraperError without importing at module level
    if (error && typeof error === 'object' && error.constructor.name === 'ScraperError') {
      return NextResponse.json(
        {
          error: (error as any).message,
          url: (error as any).url
        },
        { status: 400 }
      );
    }

    if (error instanceof Error) {
      // Handle specific error types
      if (error.message.includes('Invalid URL')) {
        return NextResponse.json(
          { error: 'Invalid URL provided' },
          { status: 400 }
        );
      }

      if (error.message.includes('Unsupported site')) {
        return NextResponse.json(
          { error: 'This website is not supported for scraping' },
          { status: 400 }
        );
      }

      if (error.message.includes('HTTP 404')) {
        return NextResponse.json(
          { error: 'Story not found at the provided URL' },
          { status: 404 }
        );
      }

      if (error.message.includes('timeout')) {
        return NextResponse.json(
          { error: 'Request timed out while fetching content' },
          { status: 408 }
        );
      }
    }

    return NextResponse.json(
      { error: 'Failed to scrape story. Please try again.' },
      { status: 500 }
    );
  }
}
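Note: both route handlers detect ScraperError by constructor name so they can avoid a module-level import. The class itself lives in frontend/src/lib/scraper/types.ts, which is not shown in this diff; below is a plausible sketch consistent with how scraper.ts constructs it (property names are assumptions):

// sketch only — the real definition is in '@/lib/scraper/types'
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly url: string,
    public readonly original?: unknown   // the wrapped lower-level error, name assumed
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}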
frontend/src/app/stories/import/bulk/page.tsx (new file, 300 lines)
@@ -0,0 +1,300 @@
|
|||||||
|
'use client';
|
||||||
|
|
||||||
|
import { useState } from 'react';
|
||||||
|
import { useRouter } from 'next/navigation';
|
||||||
|
import Link from 'next/link';
|
||||||
|
import { ArrowLeftIcon } from '@heroicons/react/24/outline';
|
||||||
|
|
||||||
|
interface ImportResult {
|
||||||
|
url: string;
|
||||||
|
status: 'imported' | 'skipped' | 'error';
|
||||||
|
reason?: string;
|
||||||
|
title?: string;
|
||||||
|
author?: string;
|
||||||
|
error?: string;
|
||||||
|
storyId?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface BulkImportResponse {
|
||||||
|
results: ImportResult[];
|
||||||
|
summary: {
|
||||||
|
total: number;
|
||||||
|
imported: number;
|
||||||
|
skipped: number;
|
||||||
|
errors: number;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function BulkImportPage() {
|
||||||
|
const router = useRouter();
|
||||||
|
const [urls, setUrls] = useState('');
|
||||||
|
const [isLoading, setIsLoading] = useState(false);
|
||||||
|
const [results, setResults] = useState<BulkImportResponse | null>(null);
|
||||||
|
const [error, setError] = useState<string | null>(null);
|
||||||
|
|
||||||
|
const handleSubmit = async (e: React.FormEvent) => {
|
||||||
|
e.preventDefault();
|
||||||
|
|
||||||
|
if (!urls.trim()) {
|
||||||
|
setError('Please enter at least one URL');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
setIsLoading(true);
|
||||||
|
setError(null);
|
||||||
|
setResults(null);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Parse URLs from textarea (one per line)
|
||||||
|
const urlList = urls
|
||||||
|
.split('\n')
|
||||||
|
.map(url => url.trim())
|
||||||
|
.filter(url => url.length > 0);
|
||||||
|
|
||||||
|
if (urlList.length === 0) {
|
||||||
|
setError('Please enter at least one valid URL');
|
||||||
|
setIsLoading(false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (urlList.length > 50) {
|
||||||
|
setError('Maximum 50 URLs allowed per bulk import');
|
||||||
|
setIsLoading(false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get auth token for server-side API calls
|
||||||
|
const token = localStorage.getItem('auth-token');
|
||||||
|
|
||||||
|
const response = await fetch('/scrape/bulk', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': token ? `Bearer ${token}` : '',
|
||||||
|
},
|
||||||
|
body: JSON.stringify({ urls: urlList }),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
const errorData = await response.json();
|
||||||
|
throw new Error(errorData.error || 'Bulk import failed');
|
||||||
|
}
|
||||||
|
|
||||||
|
const data: BulkImportResponse = await response.json();
|
||||||
|
setResults(data);
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Bulk import error:', err);
|
||||||
|
setError(err instanceof Error ? err.message : 'Failed to import stories');
|
||||||
|
} finally {
|
||||||
|
setIsLoading(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleReset = () => {
|
||||||
|
setUrls('');
|
||||||
|
setResults(null);
|
||||||
|
setError(null);
|
||||||
|
};
|
||||||
|
|
||||||
|
const getStatusColor = (status: string) => {
|
||||||
|
switch (status) {
|
||||||
|
case 'imported': return 'text-green-700 bg-green-50 border-green-200';
|
||||||
|
case 'skipped': return 'text-yellow-700 bg-yellow-50 border-yellow-200';
|
||||||
|
case 'error': return 'text-red-700 bg-red-50 border-red-200';
|
||||||
|
default: return 'text-gray-700 bg-gray-50 border-gray-200';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const getStatusIcon = (status: string) => {
|
||||||
|
switch (status) {
|
||||||
|
case 'imported': return '✓';
|
||||||
|
case 'skipped': return '⚠';
|
||||||
|
case 'error': return '✗';
|
||||||
|
default: return '';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="container mx-auto px-4 py-6">
|
||||||
|
<div className="max-w-4xl mx-auto">
|
||||||
|
{/* Header */}
|
||||||
|
<div className="mb-6">
|
||||||
|
<div className="flex items-center gap-4 mb-4">
|
||||||
|
<Link
|
||||||
|
href="/library"
|
||||||
|
className="inline-flex items-center text-blue-600 hover:text-blue-800"
|
||||||
|
>
|
||||||
|
<ArrowLeftIcon className="h-4 w-4 mr-1" />
|
||||||
|
Back to Library
|
||||||
|
</Link>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h1 className="text-3xl font-bold text-gray-900 mb-2">Bulk Import Stories</h1>
|
||||||
|
<p className="text-gray-600">
|
||||||
|
Import multiple stories at once by providing a list of URLs. Each URL will be scraped
|
||||||
|
and automatically added to your story collection.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{!results ? (
|
||||||
|
// Import Form
|
||||||
|
<form onSubmit={handleSubmit} className="space-y-6">
|
||||||
|
<div>
|
||||||
|
<label htmlFor="urls" className="block text-sm font-medium text-gray-700 mb-2">
|
||||||
|
Story URLs
|
||||||
|
</label>
|
||||||
|
<p className="text-sm text-gray-500 mb-3">
|
||||||
|
Enter one URL per line. Maximum 50 URLs per import.
|
||||||
|
</p>
|
||||||
|
<textarea
|
||||||
|
id="urls"
|
||||||
|
value={urls}
|
||||||
|
onChange={(e) => setUrls(e.target.value)}
|
||||||
|
placeholder="https://example.com/story1 https://example.com/story2 https://example.com/story3"
|
||||||
|
className="w-full h-64 px-3 py-2 border border-gray-300 rounded-md shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent"
|
||||||
|
disabled={isLoading}
|
||||||
|
/>
|
||||||
|
<p className="mt-2 text-sm text-gray-500">
|
||||||
|
URLs: {urls.split('\n').filter(url => url.trim().length > 0).length}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{error && (
|
||||||
|
<div className="bg-red-50 border border-red-200 rounded-md p-4">
|
||||||
|
<div className="flex">
|
||||||
|
<div className="ml-3">
|
||||||
|
<h3 className="text-sm font-medium text-red-800">Error</h3>
|
||||||
|
<div className="mt-2 text-sm text-red-700">
|
||||||
|
{error}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="flex gap-4">
|
||||||
|
<button
|
||||||
|
type="submit"
|
||||||
|
disabled={isLoading || !urls.trim()}
|
||||||
|
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
|
||||||
|
>
|
||||||
|
{isLoading ? 'Importing...' : 'Start Import'}
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={handleReset}
|
||||||
|
disabled={isLoading}
|
||||||
|
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2 disabled:opacity-50 disabled:cursor-not-allowed"
|
||||||
|
>
|
||||||
|
Clear
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{isLoading && (
|
||||||
|
<div className="bg-blue-50 border border-blue-200 rounded-md p-4">
|
||||||
|
<div className="flex items-center">
|
||||||
|
<div className="animate-spin rounded-full h-5 w-5 border-b-2 border-blue-600 mr-3"></div>
|
||||||
|
<div>
|
||||||
|
<p className="text-sm font-medium text-blue-800">Processing URLs...</p>
|
||||||
|
<p className="text-sm text-blue-600">
|
||||||
|
This may take a few minutes depending on the number of URLs and response times of the source websites.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</form>
|
||||||
|
) : (
|
||||||
|
// Results
|
||||||
|
<div className="space-y-6">
|
||||||
|
{/* Summary */}
|
||||||
|
<div className="bg-white border border-gray-200 rounded-lg p-6">
|
||||||
|
<h2 className="text-xl font-semibold text-gray-900 mb-4">Import Summary</h2>
|
||||||
|
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
|
||||||
|
<div className="text-center">
|
||||||
|
<div className="text-2xl font-bold text-gray-900">{results.summary.total}</div>
|
||||||
|
<div className="text-sm text-gray-600">Total URLs</div>
|
||||||
|
</div>
|
||||||
|
<div className="text-center">
|
||||||
|
<div className="text-2xl font-bold text-green-600">{results.summary.imported}</div>
|
||||||
|
<div className="text-sm text-gray-600">Imported</div>
|
||||||
|
</div>
|
||||||
|
<div className="text-center">
|
||||||
|
<div className="text-2xl font-bold text-yellow-600">{results.summary.skipped}</div>
|
||||||
|
<div className="text-sm text-gray-600">Skipped</div>
|
||||||
|
</div>
|
||||||
|
<div className="text-center">
|
||||||
|
<div className="text-2xl font-bold text-red-600">{results.summary.errors}</div>
|
||||||
|
<div className="text-sm text-gray-600">Errors</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Detailed Results */}
|
||||||
|
<div className="bg-white border border-gray-200 rounded-lg">
|
||||||
|
<div className="px-6 py-4 border-b border-gray-200">
|
||||||
|
<h3 className="text-lg font-medium text-gray-900">Detailed Results</h3>
|
||||||
|
</div>
|
||||||
|
<div className="divide-y divide-gray-200">
|
||||||
|
{results.results.map((result, index) => (
|
||||||
|
<div key={index} className="p-6">
|
||||||
|
<div className="flex items-start justify-between">
|
||||||
|
<div className="flex-1 min-w-0">
|
||||||
|
<div className="flex items-center gap-2 mb-2">
|
||||||
|
<span className={`inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium border ${getStatusColor(result.status)}`}>
|
||||||
|
{getStatusIcon(result.status)} {result.status.charAt(0).toUpperCase() + result.status.slice(1)}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p className="text-sm text-gray-900 font-medium truncate mb-1">
|
||||||
|
{result.url}
|
||||||
|
</p>
|
||||||
|
|
||||||
|
{result.title && result.author && (
|
||||||
|
<p className="text-sm text-gray-600 mb-1">
|
||||||
|
"{result.title}" by {result.author}
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{result.reason && (
|
||||||
|
<p className="text-sm text-gray-500">
|
||||||
|
{result.reason}
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{result.error && (
|
||||||
|
<p className="text-sm text-red-600">
|
||||||
|
Error: {result.error}
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Actions */}
|
||||||
|
<div className="flex gap-4">
|
||||||
|
<button
|
||||||
|
onClick={handleReset}
|
||||||
|
className="px-6 py-2 bg-blue-600 text-white font-medium rounded-md hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2"
|
||||||
|
>
|
||||||
|
Import More URLs
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<Link
|
||||||
|
href="/stories"
|
||||||
|
className="px-6 py-2 bg-gray-600 text-white font-medium rounded-md hover:bg-gray-700 focus:outline-none focus:ring-2 focus:ring-gray-500 focus:ring-offset-2"
|
||||||
|
>
|
||||||
|
View Stories
|
||||||
|
</Link>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -7,6 +7,7 @@ import { useRouter } from 'next/navigation';
import { useAuth } from '../../contexts/AuthContext';
import { useTheme } from '../../lib/theme';
import Button from '../ui/Button';
+import Dropdown from '../ui/Dropdown';

export default function Header() {
  const [isMenuOpen, setIsMenuOpen] = useState(false);

@@ -14,6 +15,24 @@ export default function Header() {
  const { theme, toggleTheme } = useTheme();
  const router = useRouter();

+  const addStoryItems = [
+    {
+      href: '/add-story',
+      label: 'Manual Entry',
+      description: 'Add a story by manually entering details'
+    },
+    {
+      href: '/stories/import',
+      label: 'Import from URL',
+      description: 'Import a single story from a website'
+    },
+    {
+      href: '/stories/import/bulk',
+      label: 'Bulk Import',
+      description: 'Import multiple stories from a list of URLs'
+    }
+  ];
+
  const handleLogout = () => {
    logout();
    router.push('/login');

@@ -57,12 +76,10 @@ export default function Header() {
            >
              Authors
            </Link>
-            <Link
-              href="/add-story"
-              className="theme-text hover:theme-accent transition-colors font-medium"
-            >
-              Add Story
-            </Link>
+            <Dropdown
+              trigger="Add Story"
+              items={addStoryItems}
+            />
          </nav>

          {/* Right side actions */}

@@ -131,13 +148,32 @@ export default function Header() {
              >
                Authors
              </Link>
-              <Link
-                href="/add-story"
-                className="theme-text hover:theme-accent transition-colors font-medium px-2 py-1"
-                onClick={() => setIsMenuOpen(false)}
-              >
-                Add Story
-              </Link>
+              <div className="px-2 py-1">
+                <div className="font-medium theme-text mb-1">Add Story</div>
+                <div className="pl-4 space-y-1">
+                  <Link
+                    href="/add-story"
+                    className="block theme-text hover:theme-accent transition-colors text-sm py-1"
+                    onClick={() => setIsMenuOpen(false)}
+                  >
+                    Manual Entry
+                  </Link>
+                  <Link
+                    href="/stories/import"
+                    className="block theme-text hover:theme-accent transition-colors text-sm py-1"
+                    onClick={() => setIsMenuOpen(false)}
+                  >
+                    Import from URL
+                  </Link>
+                  <Link
+                    href="/stories/import/bulk"
+                    className="block theme-text hover:theme-accent transition-colors text-sm py-1"
+                    onClick={() => setIsMenuOpen(false)}
+                  >
+                    Bulk Import
+                  </Link>
+                </div>
+              </div>
              <Link
                href="/settings"
                className="theme-text hover:theme-accent transition-colors font-medium px-2 py-1"
|
|||||||
frontend/src/components/ui/Dropdown.tsx (new file, 98 lines)
@@ -0,0 +1,98 @@
|
|||||||
|
'use client';

import { useState, useRef, useEffect } from 'react';
import Link from 'next/link';
import { ChevronDownIcon } from '@heroicons/react/24/outline';

interface DropdownItem {
  href: string;
  label: string;
  description?: string;
}

interface DropdownProps {
  trigger: string;
  items: DropdownItem[];
  className?: string;
  onItemClick?: () => void;
}

export default function Dropdown({ trigger, items, className = '', onItemClick }: DropdownProps) {
  const [isOpen, setIsOpen] = useState(false);
  const dropdownRef = useRef<HTMLDivElement>(null);
  const timeoutRef = useRef<NodeJS.Timeout>();

  useEffect(() => {
    const handleClickOutside = (event: MouseEvent) => {
      if (dropdownRef.current && !dropdownRef.current.contains(event.target as Node)) {
        setIsOpen(false);
      }
    };

    if (isOpen) {
      document.addEventListener('mousedown', handleClickOutside);
    }

    return () => {
      document.removeEventListener('mousedown', handleClickOutside);
      if (timeoutRef.current) {
        clearTimeout(timeoutRef.current);
      }
    };
  }, [isOpen]);

  const handleMouseEnter = () => {
    if (timeoutRef.current) {
      clearTimeout(timeoutRef.current);
    }
    setIsOpen(true);
  };

  const handleMouseLeave = () => {
    timeoutRef.current = setTimeout(() => {
      setIsOpen(false);
    }, 150);
  };

  const handleItemClick = () => {
    setIsOpen(false);
    onItemClick?.();
  };

  return (
    <div
      className={`relative ${className}`}
      ref={dropdownRef}
      onMouseEnter={handleMouseEnter}
      onMouseLeave={handleMouseLeave}
    >
      <button
        onClick={() => setIsOpen(!isOpen)}
        className="theme-text hover:theme-accent transition-colors font-medium flex items-center gap-1"
      >
        {trigger}
        <ChevronDownIcon
          className={`h-4 w-4 transition-transform duration-200 ${isOpen ? 'rotate-180' : ''}`}
        />
      </button>

      {isOpen && (
        <div className="absolute top-full left-0 mt-1 w-64 theme-card theme-shadow border theme-border rounded-lg py-2 z-50">
          {items.map((item, index) => (
            <Link
              key={index}
              href={item.href}
              onClick={handleItemClick}
              className="block px-4 py-2 theme-text hover:theme-accent transition-colors"
            >
              <div className="font-medium">{item.label}</div>
              {item.description && (
                <div className="text-sm theme-text-secondary mt-1">{item.description}</div>
              )}
            </Link>
          ))}
        </div>
      )}
    </div>
  );
}
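Note: a usage sketch of the component above, mirroring how the Header change earlier in this commit wires it up; the wrapper component and import path are illustrative:

// illustrative only
import Dropdown from '../ui/Dropdown';

export function AddStoryMenu() {
  return (
    <Dropdown
      trigger="Add Story"
      items={[
        { href: '/add-story', label: 'Manual Entry', description: 'Add a story by manually entering details' },
        { href: '/stories/import/bulk', label: 'Bulk Import' },
      ]}
    />
  );
}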
|
||||||
frontend/src/lib/scraper/config/sites.json (new file, 334 lines)
@@ -0,0 +1,334 @@
|
|||||||
|
{
|
||||||
|
"sites": {
|
||||||
|
"deviantart.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "h1",
|
||||||
|
"titleFallback": "meta[property='og:title']",
|
||||||
|
"titleFallbackAttribute": "content",
|
||||||
|
"author": {
|
||||||
|
"strategy": "text-pattern",
|
||||||
|
"pattern": "by ([^\\s]+) on DeviantArt",
|
||||||
|
"searchAfter": "<title>",
|
||||||
|
"searchBefore": "</title>"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"strategy": "text-blocks",
|
||||||
|
"minLength": 200,
|
||||||
|
"containerHints": ["journal", "literature", "story", "text", "content"],
|
||||||
|
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]
|
||||||
|
},
|
||||||
|
"summary": "meta[property='og:description']",
|
||||||
|
"summaryAttribute": "content",
|
||||||
|
"tags": "a[data-tagname]",
|
||||||
|
"tagsAttribute": "data-tagname",
|
||||||
|
"coverImage": "meta[property='og:image']",
|
||||||
|
"coverImageAttribute": "content"
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "a[data-hook='deviation_link']",
|
||||||
|
"filterStrategy": "dom-check",
|
||||||
|
"requiresChildElement": "div[class*='journal']"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"literotica.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "h1",
|
||||||
|
"titleFallback": "meta[property='og:title']",
|
||||||
|
"titleFallbackAttribute": "content",
|
||||||
|
"author": {
|
||||||
|
"strategy": "link-with-path",
|
||||||
|
"pathContains": "/authors/",
|
||||||
|
"searchWithin": "header, .story-info, #story-meta"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"strategy": "text-blocks",
|
||||||
|
"minLength": 500,
|
||||||
|
"containerHints": ["story", "content", "text"],
|
||||||
|
"excludeSelectors": ["script", "style", "nav", "header", "footer"]
|
||||||
|
},
|
||||||
|
"summary": "meta[name='description']",
|
||||||
|
"summaryAttribute": "content",
|
||||||
|
"multiPage": {
|
||||||
|
"enabled": true,
|
||||||
|
"strategy": "url-pattern",
|
||||||
|
"pageParam": "page",
|
||||||
|
"maxPages": 20
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": {
|
||||||
|
"strategy": "href-pattern",
|
||||||
|
"pattern": "/s/[^/]+$",
|
||||||
|
"searchWithin": "main, #content, .stories-list"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"mcstories.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "title",
|
||||||
|
"titleTransform": "remove-suffix: - MCStories.com",
|
||||||
|
"author": "meta[name='dcterms.creator']",
|
||||||
|
"authorAttribute": "content",
|
||||||
|
"content": "article#mcstories",
|
||||||
|
"summary": "meta[name='dcterms.description']",
|
||||||
|
"summaryAttribute": "content"
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "a[href$='.html']:not([href*='Authors'])",
|
||||||
|
"linkPrefix": "https://mcstories.com/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"docs-lab.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "title",
|
||||||
|
"titleTransform": "remove-suffix: - Doc's Lab",
|
||||||
|
"author": "a[href*='/profiles/'] strong",
|
||||||
|
"content": {
|
||||||
|
"strategy": "html-between",
|
||||||
|
"startMarker": "<h2>Story</h2>",
|
||||||
|
"endMarker": "</div>",
|
||||||
|
"includeStart": false
|
||||||
|
},
|
||||||
|
"tags": "span.label"
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "a[href*='/submissions/']",
|
||||||
|
"linkPrefix": "https://docs-lab.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"archiveofourown.org": {
|
||||||
|
"story": {
|
||||||
|
"title": "h2.title",
|
||||||
|
"author": "a[rel='author']",
|
||||||
|
"content": {
|
||||||
|
"strategy": "chapters",
|
||||||
|
"chapterSelector": "div.userstuff[role='article']",
|
||||||
|
"chaptersWrapper": "#chapters",
|
||||||
|
"singleChapter": "#workskin"
|
||||||
|
},
|
||||||
|
"summary": "div.summary blockquote.userstuff",
|
||||||
|
"tags": {
|
||||||
|
"strategy": "multiple-types",
|
||||||
|
"selectors": {
|
||||||
|
"fandom": "dd.fandom a.tag",
|
||||||
|
"warning": "dd.warning a.tag",
|
||||||
|
"category": "dd.category a.tag",
|
||||||
|
"relationship": "dd.relationship a.tag",
|
||||||
|
"character": "dd.character a.tag",
|
||||||
|
"freeform": "dd.freeform a.tag"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"multiPage": {
|
||||||
|
"enabled": true,
|
||||||
|
"strategy": "chapter-navigation",
|
||||||
|
"chapterListSelector": "#chapter_index option",
|
||||||
|
"urlPattern": "/chapters/{chapterId}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "h4.heading a[href*='/works/']",
|
||||||
|
"pagination": {
|
||||||
|
"enabled": true,
|
||||||
|
"nextPageSelector": "li.next a[rel='next']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"fanfiction.net": {
|
||||||
|
"story": {
|
||||||
|
"title": "#profile_top b.xcontrast_txt",
|
||||||
|
"author": "#profile_top a[href*='/u/']",
|
||||||
|
"content": "#storytext",
|
||||||
|
"summary": "#profile_top div.xcontrast_txt",
|
||||||
|
"coverImage": {
|
||||||
|
"strategy": "lazy-loaded",
|
||||||
|
"selector": "img.cimage",
|
||||||
|
"attribute": "data-original"
|
||||||
|
},
|
||||||
|
"multiPage": {
|
||||||
|
"enabled": true,
|
||||||
|
"strategy": "chapter-dropdown",
|
||||||
|
"chapterSelector": "select#chap_select option",
|
||||||
|
"urlPattern": "{baseUrl}/{chapterNumber}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "div.z-list a.stitle",
|
||||||
|
"metadata": {
|
||||||
|
"strategy": "sibling-text",
|
||||||
|
"metadataSelector": "div.z-padtop2",
|
||||||
|
"parsePattern": "Rated: ([^-]+) - .+ - Chapters: (\\d+)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"royalroad.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "h1[property='name']",
|
||||||
|
"author": "h4[property='author'] a",
|
||||||
|
"content": {
|
||||||
|
"strategy": "chapter-content",
|
||||||
|
"selector": "div.chapter-content",
|
||||||
|
"cleanupSelectors": [".portlet", ".ads-holder", "div[style*='display:none']"]
|
||||||
|
},
|
||||||
|
"summary": "div.description div.hidden-content",
|
||||||
|
"tags": "span.tags a.fiction-tag",
|
||||||
|
"coverImage": "img.thumbnail",
|
||||||
|
"coverImageAttribute": "src",
|
||||||
|
"multiPage": {
|
||||||
|
"enabled": true,
|
||||||
|
"strategy": "table-of-contents",
|
||||||
|
"tocSelector": "table#chapters tbody tr a[href*='/chapter/']",
|
||||||
|
"requiresAuth": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": "div.fiction-list-item h2.fiction-title a",
|
||||||
|
"additionalInfo": {
|
||||||
|
"strategy": "data-attributes",
|
||||||
|
"statsSelector": "div.stats",
|
||||||
|
"extractStats": ["pages", "followers", "views"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"wattpad.com": {
|
||||||
|
"story": {
|
||||||
|
"title": "h1",
|
||||||
|
"author": {
|
||||||
|
"strategy": "schema-org",
|
||||||
|
"schemaType": "Person",
|
||||||
|
"property": "name",
|
||||||
|
"fallbackSelector": "a[href*='/user/']"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"strategy": "react-content",
|
||||||
|
"contentClass": "pre-wrap",
|
||||||
|
"paragraphSelector": "p[data-p-id]",
|
||||||
|
"requiresJavaScript": true
|
||||||
|
},
|
||||||
|
"summary": "h2.description",
|
||||||
|
"tags": "div.tag-items a.tag",
|
||||||
|
"coverImage": {
|
||||||
|
"strategy": "responsive-image",
|
||||||
|
"selector": "img[alt*='cover']",
|
||||||
|
"srcsetAttribute": "srcset",
|
||||||
|
"selectLargest": true
|
||||||
|
},
|
||||||
|
"multiPage": {
|
||||||
|
"enabled": true,
|
||||||
|
"strategy": "api-based",
|
||||||
|
"apiPattern": "/v4/parts/{partId}/text",
|
||||||
|
"tocApiPattern": "/v5/stories/{storyId}/parts",
|
||||||
|
"requiresAuth": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"authorPage": {
|
||||||
|
"storyLinks": {
|
||||||
|
"strategy": "infinite-scroll",
|
||||||
|
"initialSelector": "a[href*='/story/']",
|
||||||
|
"apiEndpoint": "/v4/users/{userId}/stories",
|
||||||
|
"requiresJavaScript": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"strategies": {
|
||||||
|
"text-blocks": {
|
||||||
|
"description": "Find content by looking for large text blocks",
|
||||||
|
"implementation": "Find all text nodes, group by parent, select parent with most text"
|
||||||
|
},
|
||||||
|
"link-with-path": {
|
||||||
|
"description": "Find links containing specific path patterns",
|
||||||
|
"implementation": "querySelector with href*= or iterate and check .href property"
|
||||||
|
},
|
||||||
|
"href-pattern": {
|
||||||
|
"description": "Match links by regex pattern",
|
||||||
|
"implementation": "Array.from(links).filter(a => pattern.test(a.href))"
|
||||||
|
},
|
||||||
|
"text-pattern": {
|
||||||
|
"description": "Extract text using regex from raw HTML",
|
||||||
|
"implementation": "Use regex on .html() with proper groups"
|
||||||
|
},
|
||||||
|
"html-between": {
|
||||||
|
"description": "Extract HTML between markers",
|
||||||
|
"implementation": "indexOf() to find positions, substring to extract"
|
||||||
|
},
|
||||||
|
"chapters": {
|
||||||
|
"description": "Extract story content that may be in chapters",
|
||||||
|
"implementation": "Check for multiple chapters or single chapter format"
|
||||||
|
},
|
||||||
|
"multiple-types": {
|
||||||
|
"description": "Extract different categories of tags",
|
||||||
|
"implementation": "Map over selector types and extract each category"
|
||||||
|
},
|
||||||
|
"chapter-navigation": {
|
||||||
|
"description": "Navigate through chapters using chapter index",
|
||||||
|
"implementation": "Extract chapter IDs and construct URLs"
|
||||||
|
},
|
||||||
|
"lazy-loaded": {
|
||||||
|
"description": "Extract images that are lazy-loaded",
|
||||||
|
"implementation": "Check data-* attributes for actual image source"
|
||||||
|
},
|
||||||
|
"chapter-dropdown": {
|
||||||
|
"description": "Handle stories with chapter selection dropdown",
|
||||||
|
"implementation": "Parse dropdown options and construct chapter URLs"
|
||||||
|
},
|
||||||
|
"table-of-contents": {
|
||||||
|
"description": "Extract chapters from a table of contents",
|
||||||
|
"implementation": "Find all chapter links in TOC structure"
|
||||||
|
},
|
||||||
|
"schema-org": {
|
||||||
|
"description": "Extract data from schema.org structured data",
|
||||||
|
"implementation": "Parse JSON-LD or microdata for specific properties"
|
||||||
|
},
|
||||||
|
"react-content": {
|
||||||
|
"description": "Extract content from React-rendered pages",
|
||||||
|
"implementation": "May require JavaScript execution or API access"
|
||||||
|
},
|
||||||
|
"responsive-image": {
|
||||||
|
"description": "Select best quality from responsive images",
|
||||||
|
"implementation": "Parse srcset and select highest resolution"
|
||||||
|
},
|
||||||
|
"api-based": {
|
||||||
|
"description": "Use API endpoints instead of HTML scraping",
|
||||||
|
"implementation": "Detect API patterns and make direct API calls"
|
||||||
|
},
|
||||||
|
"infinite-scroll": {
|
||||||
|
"description": "Handle pages with infinite scroll",
|
||||||
|
"implementation": "Detect scroll API endpoints or pagination"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
"globalOptions": {
|
||||||
|
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||||
|
"timeout": 30000,
|
||||||
|
"retryAttempts": 3,
|
||||||
|
"rateLimitMs": 1000,
|
||||||
|
"cacheDuration": 300000,
|
||||||
|
"javascriptTimeout": 10000
|
||||||
|
},
|
||||||
|
|
||||||
|
"siteNotes": {
|
||||||
|
"wattpad.com": {
|
||||||
|
"warning": "Wattpad has aggressive anti-scraping measures. Consider using their API if available.",
|
||||||
|
"requiresAuth": "Some stories may require login to access full content"
|
||||||
|
},
|
||||||
|
"royalroad.com": {
|
||||||
|
"note": "Very scraper-friendly with good HTML structure"
|
||||||
|
},
|
||||||
|
"archiveofourown.org": {
|
||||||
|
"note": "Respects robots.txt, has good semantic HTML",
|
||||||
|
"rateLimit": "Be extra respectful of rate limits"
|
||||||
|
},
|
||||||
|
"fanfiction.net": {
|
||||||
|
"note": "Older site with simpler HTML structure",
|
||||||
|
"warning": "Known to block IPs for aggressive scraping"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
frontend/src/lib/scraper/scraper.ts (new file, 379 lines)
@@ -0,0 +1,379 @@
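Note: the scraper below drives extraction off the sites.json entries above. A trimmed sketch, assuming only the flat-selector case, of how an entry like the mcstories.com one maps onto cheerio calls; the real implementation also handles strategy objects, fallbacks, caching, rate limiting and multi-page stories:

// simplified, illustrative only
import * as cheerio from 'cheerio';

const entry = {
  title: 'title',                                // selector for the <title> element
  titleSuffix: ' - MCStories.com',               // stripped per the titleTransform rule in the config
  author: "meta[name='dcterms.creator']",
  authorAttribute: 'content',                    // read an attribute instead of element text
  content: 'article#mcstories',
};

function extract(html: string) {
  const $ = cheerio.load(html);
  return {
    title: $(entry.title).first().text().replace(entry.titleSuffix, '').trim(),
    author: $(entry.author).attr(entry.authorAttribute) ?? '',
    content: $(entry.content).html() ?? '',
  };
}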
|
|||||||
|
import 'server-only';
|
||||||
|
|
||||||
|
// Note: cheerio import is done dynamically to avoid client-side bundling issues
|
||||||
|
// Using any type for CheerioAPI to prevent bundling issues
|
||||||
|
import {
|
||||||
|
SitesConfig,
|
||||||
|
SiteConfig,
|
||||||
|
ScrapedStory,
|
  ScrapedAuthorStory,
  SelectorStrategy,
  MultiPageConfig,
  ScraperError
} from './types';
import { RateLimiter } from './utils/rateLimit';
import { ScraperCache } from './utils/cache';
import { UrlParser } from './utils/urlParser';
import {
  extractByTextPattern,
  extractTextBlocks,
  extractHtmlBetween,
  extractLinkText,
  extractLinkWithPath,
  extractHrefPattern,
  extractFirstImage,
  extractResponsiveImage,
  extractLazyLoadedImage,
  extractChapters,
  extractChapterContent,
  extractMultipleTypes,
  extractSchemaOrg,
  extractReactContent,
  cleanHtml,
  extractAttribute
} from './strategies';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: SitesConfig;
  private cache: ScraperCache;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig as SitesConfig;
    this.cache = new ScraperCache(this.config.globalOptions.cacheDuration);
    this.rateLimiter = new RateLimiter(this.config.globalOptions.rateLimitMs);
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    try {
      if (!UrlParser.validateUrl(url)) {
        throw new Error(`Invalid URL: ${url}`);
      }

      const domain = UrlParser.getDomain(url);
      const siteConfig = this.config.sites[domain];

      if (!siteConfig) {
        throw new Error(`Unsupported site: ${domain}`);
      }

      const html = await this.fetchWithCache(url);
      const cheerio = await import('cheerio');
      const $ = cheerio.load(html);

      const story: ScrapedStory = {
        title: await this.extractFieldWithFallback($, siteConfig.story, 'title', html),
        author: await this.extractFieldWithFallback($, siteConfig.story, 'author', html),
        content: await this.extractContent($, siteConfig.story, url, html),
        sourceUrl: url
      };

      // Extract optional fields
      if (siteConfig.story.summary) {
        story.summary = await this.extractField($, siteConfig.story.summary, html, siteConfig.story.summaryAttribute);
      }

      if (siteConfig.story.coverImage) {
        story.coverImage = await this.extractField($, siteConfig.story.coverImage, html, siteConfig.story.coverImageAttribute);
      }

      if (siteConfig.story.tags) {
        const tagsResult = await this.extractTags($, siteConfig.story.tags, html, siteConfig.story.tagsAttribute);
        if (Array.isArray(tagsResult)) {
          story.tags = tagsResult;
        } else if (typeof tagsResult === 'string' && tagsResult) {
          story.tags = [tagsResult];
        }
      }

      // Apply post-processing
      story.title = this.applyTransforms(story.title, siteConfig.story.titleTransform);
      story.content = await cleanHtml(story.content);

      return story;
    } catch (error) {
      if (error instanceof Error) {
        throw new ScraperError(
          `Failed to scrape ${url}: ${error.message}`,
          url,
          error
        );
      }
      throw error;
    }
  }

  async scrapeAuthorPage(url: string): Promise<ScrapedAuthorStory[]> {
    try {
      if (!UrlParser.validateUrl(url)) {
        throw new Error(`Invalid URL: ${url}`);
      }

      const domain = UrlParser.getDomain(url);
      const siteConfig = this.config.sites[domain];

      if (!siteConfig || !siteConfig.authorPage) {
        throw new Error(`Author page scraping not supported for: ${domain}`);
      }

      const html = await this.fetchWithCache(url);
      const cheerio = await import('cheerio');
      const $ = cheerio.load(html);

      const storyLinks = await this.extractField($, siteConfig.authorPage.storyLinks, html);
      const stories: ScrapedAuthorStory[] = [];

      if (Array.isArray(storyLinks)) {
        for (const link of storyLinks) {
          const storyUrl = UrlParser.normalizeUrl(link, url);
          try {
            const scrapedStory = await this.scrapeStory(storyUrl);
            stories.push({
              url: storyUrl,
              title: scrapedStory.title,
              author: scrapedStory.author,
              summary: scrapedStory.summary
            });
          } catch (error) {
            console.warn(`Failed to scrape story ${storyUrl}:`, error);
          }
        }
      }

      return stories;
    } catch (error) {
      if (error instanceof Error) {
        throw new ScraperError(
          `Failed to scrape author page ${url}: ${error.message}`,
          url,
          error
        );
      }
      throw error;
    }
  }

  private async extractFieldWithFallback(
    $: any,
    config: any,
    fieldName: string,
    html: string
  ): Promise<string> {
    const primarySelector = config[fieldName];
    const fallbackSelector = config[`${fieldName}Fallback`];
    const attribute = config[`${fieldName}Attribute`];
    const fallbackAttribute = config[`${fieldName}FallbackAttribute`];

    // Try primary selector first
    if (primarySelector) {
      const result = await this.extractField($, primarySelector, html, attribute);
      if (result && result.trim()) {
        return result.trim();
      }
    }

    // Try fallback selector if primary failed
    if (fallbackSelector) {
      const result = await this.extractField($, fallbackSelector, html, fallbackAttribute);
      if (result && result.trim()) {
        return result.trim();
      }
    }

    return '';
  }

  private async extractField(
    $: any,
    selector: string | SelectorStrategy,
    html: string,
    attribute?: string
  ): Promise<any> {
    if (typeof selector === 'string') {
      // Simple CSS selector - always return single value (first element)
      const element = $(selector).first();
      if (attribute) {
        // Extract specific attribute instead of text
        return element.attr(attribute) || '';
      }
      return element.text().trim();
    }

    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async extractTags(
    $: any,
    selector: string | SelectorStrategy,
    html: string,
    attribute?: string
  ): Promise<any> {
    if (typeof selector === 'string') {
      // Simple CSS selector - collect ALL matching elements for tags
      const elements = $(selector);

      if (elements.length === 0) {
        return [];
      }

      const results: string[] = [];
      elements.each((_: any, elem: any) => {
        const $elem = $(elem);
        const value = attribute ? $elem.attr(attribute) : $elem.text().trim();
        if (value) {
          results.push(value);
        }
      });

      return results;
    }

    // Strategy-based extraction for tags
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: any,
    strategy: SelectorStrategy,
    html: string
  ): Promise<any> {
    switch (strategy.strategy) {
      case 'text-pattern':
        return extractByTextPattern(html, strategy as any);
      case 'link-with-path':
        return extractLinkWithPath($, strategy as any);
      case 'text-blocks':
        return extractTextBlocks($, strategy as any);
      case 'href-pattern':
        return extractHrefPattern($, strategy as any);
      case 'html-between':
        return extractHtmlBetween(html, strategy as any);
      case 'link-text':
        return extractLinkText($, strategy as any);
      case 'first-image':
        return extractFirstImage($, strategy as any);
      case 'responsive-image':
        return extractResponsiveImage($, strategy as any);
      case 'lazy-loaded':
        return extractLazyLoadedImage($, strategy as any);
      case 'chapters':
        return extractChapters($, strategy as any);
      case 'chapter-content':
        return extractChapterContent($, strategy as any);
      case 'multiple-types':
        return extractMultipleTypes($, strategy as any);
      case 'schema-org':
        return extractSchemaOrg($, strategy as any);
      case 'react-content':
        return extractReactContent($, strategy as any);
      default:
        throw new Error(`Unknown strategy: ${strategy.strategy}`);
    }
  }

  private async extractContent(
    $: any,
    storyConfig: any,
    url: string,
    html: string
  ): Promise<string> {
    let content = await this.extractField($, storyConfig.content, html);

    if (storyConfig.multiPage?.enabled) {
      const additionalPages = await this.fetchAdditionalPages(
        $,
        url,
        storyConfig.multiPage
      );

      for (const pageHtml of additionalPages) {
        const cheerioPage = await import('cheerio');
        const $page = cheerioPage.load(pageHtml);
        const pageContent = await this.extractField(
          $page,
          storyConfig.content,
          pageHtml
        );
        content += '\n\n' + pageContent;
      }
    }

    return content;
  }

  private async fetchAdditionalPages(
    $: any,
    baseUrl: string,
    config: MultiPageConfig
  ): Promise<string[]> {
    const pages: string[] = [];
    let currentUrl = baseUrl;
    let pageNum = 2;

    while (pageNum <= (config.maxPages || 20)) {
      let nextUrl: string | null = null;

      if (config.strategy === 'url-pattern') {
        nextUrl = UrlParser.buildPageUrl(baseUrl, pageNum, config);
      } else if (config.nextPageSelector) {
        const nextLink = $(config.nextPageSelector).attr('href');
        if (nextLink) {
          nextUrl = UrlParser.normalizeUrl(nextLink, currentUrl);
        }
      }

      if (!nextUrl) break;

      try {
        await this.rateLimiter.throttle();
        const html = await this.fetchWithCache(nextUrl);
        pages.push(html);
        currentUrl = nextUrl;
        pageNum++;
      } catch (error) {
        console.error(`Failed to fetch page ${pageNum}:`, error);
        break;
      }
    }

    return pages;
  }

  private async fetchWithCache(url: string): Promise<string> {
    const cached = this.cache.get(url);
    if (cached) {
      return cached;
    }

    await this.rateLimiter.throttle();

    const response = await fetch(url, {
      headers: {
        'User-Agent': this.config.globalOptions.userAgent,
      },
      signal: AbortSignal.timeout(this.config.globalOptions.timeout)
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    this.cache.set(url, html);

    return html;
  }

  private applyTransforms(text: string, transform?: string): string {
    if (!transform) return text;

    if (transform.startsWith('remove-suffix:')) {
      const suffix = transform.substring('remove-suffix:'.length).trim();
      return text.replace(new RegExp(`${suffix}$`, 'i'), '').trim();
    }

    return text;
  }
}
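
For context, a minimal sketch of how `StoryScraper` might be wired into the `POST /api/scrape/story` route described in the spec further below. The route path comes from the spec; the handler body, the `@/lib/scraper` import alias, and the error mapping are illustrative assumptions, not the committed route implementation.

```typescript
// Hypothetical /app/api/scrape/story/route.ts - illustrative only, not part of this commit.
import { NextResponse } from 'next/server';
import { StoryScraper } from '@/lib/scraper/scraper';
import { ScraperError } from '@/lib/scraper/types';

export async function POST(request: Request) {
  const { url } = await request.json();

  if (typeof url !== 'string') {
    return NextResponse.json({ error: 'url is required' }, { status: 400 });
  }

  const scraper = new StoryScraper();
  try {
    // scrapeStory resolves title/author/content via the per-site selectors
    // and throws ScraperError for unsupported domains or fetch failures.
    const story = await scraper.scrapeStory(url);
    return NextResponse.json(story);
  } catch (error) {
    const message = error instanceof ScraperError ? error.message : 'Scraping failed';
    return NextResponse.json({ error: message }, { status: 422 });
  }
}
```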
164
frontend/src/lib/scraper/strategies/contentCleaner.ts
Normal file
@@ -0,0 +1,164 @@
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
  ChaptersStrategy,
  ChapterContentStrategy,
  MultipleTypesStrategy,
  SchemaOrgStrategy,
  ReactContentStrategy
} from '../types';

export function extractChapters(
  $: any,
  config: ChaptersStrategy
): string {
  // Check for multiple chapters first
  if (config.chaptersWrapper) {
    const chaptersWrapper = $(config.chaptersWrapper);
    if (chaptersWrapper.length > 0) {
      const chapters = chaptersWrapper.find(config.chapterSelector);
      if (chapters.length > 1) {
        // Multiple chapters - combine them
        let content = '';
        chapters.each((_: any, elem: any) => {
          content += $(elem).html() + '\n\n';
        });
        return content.trim();
      }
    }
  }

  // Single chapter fallback
  if (config.singleChapter) {
    const singleChapter = $(config.singleChapter);
    if (singleChapter.length > 0) {
      return singleChapter.html() || '';
    }
  }

  // Direct chapter selector fallback
  const chapter = $(config.chapterSelector).first();
  return chapter.html() || '';
}

export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const content = $(config.selector);

  // Remove cleanup selectors
  if (config.cleanupSelectors) {
    config.cleanupSelectors.forEach(selector => {
      content.find(selector).remove();
    });
  }

  return content.html() || '';
}

export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const tags: string[] = [];

  Object.entries(config.selectors).forEach(([type, selector]) => {
    $(selector).each((_: any, elem: any) => {
      const tag = $(elem).text().trim();
      if (tag) {
        tags.push(`${type}: ${tag}`);
      }
    });
  });

  return tags;
}

export function extractSchemaOrg(
  $: any,
  config: SchemaOrgStrategy
): string {
  // Look for JSON-LD first. Capture the match in a local variable:
  // returning from inside .each() only ends that callback and would
  // otherwise discard the value instead of returning it from this function.
  let result = '';
  $('script[type="application/ld+json"]').each((_: any, elem: any) => {
    try {
      const data = JSON.parse($(elem).html() || '');
      if (data['@type'] === config.schemaType ||
          (Array.isArray(data) && data.some(item => item['@type'] === config.schemaType))) {
        const item = Array.isArray(data) ?
          data.find(item => item['@type'] === config.schemaType) : data;
        if (item && item[config.property]) {
          result = item[config.property];
          return false; // stop iterating once a match is found
        }
      }
    } catch (e) {
      // Invalid JSON, continue
    }
  });

  if (result) {
    return result;
  }

  // Fallback to selector
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }

  return '';
}

export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  // This is a simplified version - full React content extraction
  // would require JavaScript execution or API access

  const contentElements = $(config.paragraphSelector);
  let content = '';

  contentElements.each((_: any, elem: any) => {
    const $elem = $(elem);
    if ($elem.hasClass(config.contentClass)) {
      content += $elem.html() + '\n\n';
    }
  });

  return content.trim();
}

export async function cleanHtml(html: string): Promise<string> {
  // Basic HTML cleaning - remove scripts, styles, and dangerous elements
  const cheerio = await import('cheerio');
  const $ = cheerio.load(html, {
    // Preserve self-closing tags like <br>
    xmlMode: false,
    decodeEntities: false
  });

  // Remove dangerous elements
  $('script, style, iframe, embed, object').remove();

  // Remove empty paragraphs and divs (but preserve <br> tags)
  $('p:empty, div:empty').not(':has(br)').remove();

  // Clean up excessive whitespace in text nodes only, preserve <br> tags.
  // $('*') matches elements only, so walk each element's child nodes to reach text nodes.
  $('*').contents().each((_, node) => {
    if (node.type === 'text') {
      const $node = $(node);
      const text = $node.text();
      if (text && text.trim() !== text) {
        $node.replaceWith(text.trim());
      }
    }
  });

  // Return HTML with proper self-closing tag format
  return $.html() || '';
}

export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const element = $(selector).first();
  return element.attr(attribute) || '';
}
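
To illustrate how these strategy functions are driven by configuration, here is a hypothetical `schema-org` entry; the schema type, property, and fallback selector are invented for the example and are not taken from the shipped sites.json (whose diff is suppressed below).

```typescript
import { extractSchemaOrg } from './contentCleaner';
import { SchemaOrgStrategy } from '../types';

// Hypothetical config: read the author value from JSON-LD, fall back to a CSS selector.
const authorStrategy: SchemaOrgStrategy = {
  strategy: 'schema-org',
  schemaType: 'Book',
  property: 'author',
  fallbackSelector: '.author-name'
};

// $ would be a cheerio instance loaded from the fetched page HTML:
// const author = extractSchemaOrg($, authorStrategy);
```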
3
frontend/src/lib/scraper/strategies/index.ts
Normal file
@@ -0,0 +1,3 @@
export * from './textExtractor';
export * from './linkExtractor';
export * from './contentCleaner';
98
frontend/src/lib/scraper/strategies/linkExtractor.ts
Normal file
@@ -0,0 +1,98 @@
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
  LinkWithPathStrategy,
  HrefPatternStrategy,
  FirstImageStrategy,
  ResponsiveImageStrategy,
  LazyLoadedStrategy
} from '../types';

export function extractLinkWithPath(
  $: any,
  config: LinkWithPathStrategy
): string {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');

  const links = searchScope.find('a');

  for (let i = 0; i < links.length; i++) {
    const link = links.eq(i);
    const href = link.attr('href');

    if (href && href.includes(config.pathContains)) {
      return link.text().trim();
    }
  }

  return '';
}

export function extractHrefPattern(
  $: any,
  config: HrefPatternStrategy
): string[] {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');

  const pattern = new RegExp(config.pattern);
  const links: string[] = [];

  searchScope.find('a').each((_: any, elem: any) => {
    const href = $(elem).attr('href');
    if (href && pattern.test(href)) {
      links.push(href);
    }
  });

  return links;
}

export function extractFirstImage(
  $: any,
  config: FirstImageStrategy
): string {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');

  const img = searchScope.find('img').first();
  return img.attr(config.attribute) || '';
}

export function extractResponsiveImage(
  $: any,
  config: ResponsiveImageStrategy
): string {
  const img = $(config.selector).first();

  if (config.selectLargest && config.srcsetAttribute) {
    const srcset = img.attr(config.srcsetAttribute);
    if (srcset) {
      // Parse srcset and return the largest image
      const sources = srcset.split(',').map((src: string) => {
        const parts = src.trim().split(' ');
        const url = parts[0];
        const descriptor = parts[1] || '1x';
        const width = descriptor.includes('w') ?
          parseInt(descriptor.replace('w', '')) :
          descriptor.includes('x') ?
            parseInt(descriptor.replace('x', '')) * 100 : 100;
        return { url, width };
      });

      const largest = sources.reduce((prev: any, current: any) =>
        prev.width > current.width ? prev : current
      );

      return largest.url;
    }
  }

  return img.attr('src') || '';
}

export function extractLazyLoadedImage(
  $: any,
  config: LazyLoadedStrategy
): string {
  const img = $(config.selector).first();
  return img.attr(config.attribute) || img.attr('src') || '';
}
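
A small worked example of the srcset handling above; the markup is invented for illustration. With `selectLargest` enabled, the candidate with the greatest width descriptor wins, otherwise the plain `src` attribute is used.

```typescript
import * as cheerio from 'cheerio';
import { extractResponsiveImage } from './linkExtractor';

// Invented markup for illustration only.
const $ = cheerio.load('<img id="cover" srcset="cover-480.jpg 480w, cover-1600.jpg 1600w">');

// Picks the candidate with the largest width descriptor: 'cover-1600.jpg'.
const coverUrl = extractResponsiveImage($, {
  strategy: 'responsive-image',
  selector: '#cover',
  srcsetAttribute: 'srcset',
  selectLargest: true
});
```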
144
frontend/src/lib/scraper/strategies/textExtractor.ts
Normal file
@@ -0,0 +1,144 @@
import 'server-only';

// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
  TextPatternStrategy,
  TextBlockStrategy,
  HtmlBetweenStrategy,
  LinkTextStrategy
} from '../types';

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  let searchContent = html;

  // Limit search scope if specified
  if (config.searchAfter) {
    const afterIndex = html.indexOf(config.searchAfter);
    if (afterIndex !== -1) {
      searchContent = html.substring(afterIndex);
    }
  }

  if (config.searchBefore) {
    const beforeIndex = searchContent.indexOf(config.searchBefore);
    if (beforeIndex !== -1) {
      searchContent = searchContent.substring(0, beforeIndex);
    }
  }

  const regex = new RegExp(config.pattern, 'i');
  const match = searchContent.match(regex);
  return match ? match[config.group || 1].trim() : '';
}

export function extractTextBlocks(
  $: any,
  config: TextBlockStrategy
): string {
  const blocks: Array<{element: any, text: string}> = [];

  // Remove excluded elements first
  if (config.excludeSelectors) {
    config.excludeSelectors.forEach(selector => {
      $(selector).remove();
    });
  }

  $('*').each((_: any, elem: any) => {
    const $elem = $(elem);
    const text = $elem.clone().children().remove().end().text().trim();

    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Nothing long enough was found; bail out before reducing an empty array
  if (blocks.length === 0) {
    return '';
  }

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    if (config.containerHints && config.containerHints.length > 0) {
      const hasHints = config.containerHints.some(hint =>
        $(block.element).attr('class')?.includes(hint) ||
        $(block.element).attr('id')?.includes(hint)
      );
      return hasHints;
    }
    return blocks.length === 1;
  });

  if (storyBlock) {
    return $(storyBlock.element).html() || '';
  }

  // Fallback to largest block
  const largestBlock = blocks.reduce((prev, current) =>
    prev.text.length > current.text.length ? prev : current
  );

  return largestBlock ? $(largestBlock.element).html() || '' : '';
}

export function extractHtmlBetween(
  html: string,
  config: HtmlBetweenStrategy
): string {
  const startIndex = html.indexOf(config.startMarker);
  if (startIndex === -1) return '';

  const contentStart = config.includeStart ?
    startIndex :
    startIndex + config.startMarker.length;

  const endIndex = html.indexOf(config.endMarker, contentStart);
  if (endIndex === -1) {
    return html.substring(contentStart);
  }

  return html.substring(contentStart, endIndex).trim();
}

export function extractLinkText(
  $: any,
  config: LinkTextStrategy
): string {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');

  // Look for links near the specified text patterns
  let foundText = '';

  config.nearText.forEach(text => {
    if (foundText) return; // Already found

    searchScope.find('*').each((_: any, elem: any) => {
      const $elem = $(elem);
      const elemText = $elem.text().toLowerCase();

      if (elemText.includes(text.toLowerCase())) {
        // Look for nearby links
        const $link = $elem.find('a').first();
        if ($link.length) {
          foundText = $link.text().trim();
          return false; // Break out of each
        }

        // Check if the element itself is a link
        if ($elem.is('a')) {
          foundText = $elem.text().trim();
          return false;
        }

        // Look for links in the next few siblings
        const $siblings = $elem.nextAll().slice(0, 3);
        $siblings.find('a').first().each((_: any, link: any) => {
          foundText = $(link).text().trim();
          return false;
        });
      }
    });
  });

  return foundText;
}
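
For illustration, a hypothetical `text-pattern` strategy run against a raw HTML fragment (both the fragment and the pattern are invented): the first capture group of the regex becomes the extracted value, since `group` defaults to 1.

```typescript
import { extractByTextPattern } from './textExtractor';

// Invented fragment and pattern for illustration only.
const html = '<div class="meta">Author: Jane Doe</div>';

// Returns the trimmed capture group: 'Jane Doe'.
const author = extractByTextPattern(html, {
  strategy: 'text-pattern',
  pattern: 'Author:\\s*([^<]+)'
});
```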
248
frontend/src/lib/scraper/types.ts
Normal file
@@ -0,0 +1,248 @@
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
  titleFallback?: string;
  titleFallbackAttribute?: string;
  titleTransform?: string;
  summaryAttribute?: string;
  coverImageAttribute?: string;
  tagsAttribute?: string;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: PaginationConfig;
  linkPrefix?: string;
  filterStrategy?: string;
  requiresChildElement?: string;
  requiresNavigation?: NavigationConfig;
  metadata?: MetadataConfig;
  additionalInfo?: AdditionalInfoConfig;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link' | 'chapter-navigation' | 'chapter-dropdown' | 'table-of-contents' | 'api-based';
  nextPageSelector?: string;
  pageParam?: string;
  maxPages?: number;
  chapterListSelector?: string;
  chapterSelector?: string;
  urlPattern?: string;
  tocSelector?: string;
  requiresAuth?: boolean;
  apiPattern?: string;
  tocApiPattern?: string;
}

export interface PaginationConfig {
  enabled: boolean;
  nextPageSelector: string;
}

export interface NavigationConfig {
  enabled: boolean;
  clickText: string;
  waitMs: number;
}

export interface MetadataConfig {
  strategy: string;
  metadataSelector: string;
  parsePattern: string;
}

export interface AdditionalInfoConfig {
  strategy: string;
  statsSelector: string;
  extractStats: string[];
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}

export interface ScrapedAuthorStory {
  url: string;
  title: string;
  author: string;
  summary?: string;
}

export interface SitesConfig {
  sites: Record<string, SiteConfig>;
  strategies: Record<string, StrategyDescription>;
  globalOptions: GlobalOptions;
  siteNotes?: Record<string, SiteNotes>;
}

export interface StrategyDescription {
  description: string;
  implementation: string;
}

export interface GlobalOptions {
  userAgent: string;
  timeout: number;
  retryAttempts: number;
  rateLimitMs: number;
  cacheDuration?: number;
  javascriptTimeout?: number;
}

export interface SiteNotes {
  warning?: string;
  note?: string;
  rateLimit?: string;
  requiresAuth?: string;
}

// Strategy-specific interfaces
export interface TextPatternStrategy extends SelectorStrategy {
  strategy: 'text-pattern';
  pattern: string;
  group?: number;
  searchAfter?: string;
  searchBefore?: string;
}

export interface LinkWithPathStrategy extends SelectorStrategy {
  strategy: 'link-with-path';
  pathContains: string;
  searchWithin?: string;
}

export interface TextBlockStrategy extends SelectorStrategy {
  strategy: 'text-blocks';
  minLength?: number;
  containerHints?: string[];
  excludeSelectors?: string[];
}

export interface HrefPatternStrategy extends SelectorStrategy {
  strategy: 'href-pattern';
  pattern: string;
  searchWithin?: string;
}

export interface HtmlBetweenStrategy extends SelectorStrategy {
  strategy: 'html-between';
  startMarker: string;
  endMarker: string;
  includeStart?: boolean;
}

export interface ChaptersStrategy extends SelectorStrategy {
  strategy: 'chapters';
  chapterSelector: string;
  chaptersWrapper?: string;
  singleChapter?: string;
}

export interface MultipleTypesStrategy extends SelectorStrategy {
  strategy: 'multiple-types';
  selectors: Record<string, string>;
}

export interface LinkTextStrategy extends SelectorStrategy {
  strategy: 'link-text';
  nearText: string[];
  searchWithin?: string;
}

export interface FirstImageStrategy extends SelectorStrategy {
  strategy: 'first-image';
  searchWithin: string;
  attribute: string;
}

export interface SchemaOrgStrategy extends SelectorStrategy {
  strategy: 'schema-org';
  schemaType: string;
  property: string;
  fallbackSelector?: string;
}

export interface ReactContentStrategy extends SelectorStrategy {
  strategy: 'react-content';
  contentClass: string;
  paragraphSelector: string;
  requiresJavaScript: boolean;
}

export interface ResponsiveImageStrategy extends SelectorStrategy {
  strategy: 'responsive-image';
  selector: string;
  srcsetAttribute: string;
  selectLargest: boolean;
}

export interface LazyLoadedStrategy extends SelectorStrategy {
  strategy: 'lazy-loaded';
  selector: string;
  attribute: string;
}

export interface ChapterContentStrategy extends SelectorStrategy {
  strategy: 'chapter-content';
  selector: string;
  cleanupSelectors?: string[];
}

export interface DataAttributesStrategy extends SelectorStrategy {
  strategy: 'data-attributes';
  statsSelector: string;
  extractStats: string[];
}

export interface SiblingTextStrategy extends SelectorStrategy {
  strategy: 'sibling-text';
  metadataSelector: string;
  parsePattern: string;
}

export interface ApiBasedStrategy extends SelectorStrategy {
  strategy: 'api-based';
  apiPattern: string;
  tocApiPattern?: string;
  requiresAuth: boolean;
}

export interface InfiniteScrollStrategy extends SelectorStrategy {
  strategy: 'infinite-scroll';
  initialSelector: string;
  apiEndpoint: string;
  requiresJavaScript: boolean;
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
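
These types are the shape that `config/sites.json` must conform to (its diff is suppressed in this commit view as too long). A hypothetical entry, with a made-up domain and selectors, just to show how plain CSS selector strings and strategy objects mix in one config:

```typescript
import { SitesConfig } from './types';

// Hypothetical configuration - example.com and all selectors below are invented.
const exampleConfig: SitesConfig = {
  sites: {
    'example.com': {
      story: {
        title: 'h1.story-title',
        author: { strategy: 'link-with-path', pathContains: '/members/', searchWithin: '.story-header' },
        content: { strategy: 'text-blocks', minLength: 400, containerHints: ['story', 'chapter'] },
        tags: '.tag-list a',
        multiPage: { enabled: true, strategy: 'url-pattern', pageParam: 'page', maxPages: 10 }
      },
      authorPage: {
        storyLinks: { strategy: 'href-pattern', pattern: '/stories/\\d+' }
      }
    }
  },
  strategies: {},
  globalOptions: {
    userAgent: 'StoryCoveScraper/1.0',
    timeout: 15000,
    retryAttempts: 2,
    rateLimitMs: 1000,
    cacheDuration: 300000
  }
};
```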
35
frontend/src/lib/scraper/utils/cache.ts
Normal file
@@ -0,0 +1,35 @@
export class ScraperCache {
  private cache: Map<string, { data: any; timestamp: number }> = new Map();
  private ttl: number;

  constructor(ttlMs: number = 300000) { // 5 minutes default
    this.ttl = ttlMs;
  }

  get(key: string): any | null {
    const entry = this.cache.get(key);
    if (!entry) return null;

    if (Date.now() - entry.timestamp > this.ttl) {
      this.cache.delete(key);
      return null;
    }

    return entry.data;
  }

  set(key: string, data: any): void {
    this.cache.set(key, {
      data,
      timestamp: Date.now()
    });
  }

  clear(): void {
    this.cache.clear();
  }

  size(): number {
    return this.cache.size;
  }
}
23
frontend/src/lib/scraper/utils/rateLimit.ts
Normal file
@@ -0,0 +1,23 @@
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;

    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }

    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
61
frontend/src/lib/scraper/utils/urlParser.ts
Normal file
@@ -0,0 +1,61 @@
export class UrlParser {
  static getDomain(url: string): string {
    try {
      const urlObj = new URL(url);
      return urlObj.hostname.replace(/^www\./, '');
    } catch (error) {
      throw new Error(`Invalid URL: ${url}`);
    }
  }

  static validateUrl(url: string): boolean {
    try {
      const urlObj = new URL(url);
      return urlObj.protocol === 'http:' || urlObj.protocol === 'https:';
    } catch {
      return false;
    }
  }

  static buildPageUrl(baseUrl: string, pageNum: number, config: any): string {
    try {
      const urlObj = new URL(baseUrl);
      if (config.pageParam) {
        urlObj.searchParams.set(config.pageParam, pageNum.toString());
      } else if (config.urlPattern) {
        // Replace {page} or similar patterns in URL
        return config.urlPattern.replace(/\{page\}/g, pageNum.toString());
      }
      return urlObj.toString();
    } catch (error) {
      throw new Error(`Failed to build page URL: ${error}`);
    }
  }

  static normalizeUrl(url: string, baseUrl?: string): string {
    try {
      if (url.startsWith('http://') || url.startsWith('https://')) {
        return url;
      }

      if (baseUrl) {
        return new URL(url, baseUrl).toString();
      }

      return url;
    } catch (error) {
      throw new Error(`Failed to normalize URL: ${url}`);
    }
  }

  static extractDomainConfig(url: string, sitesConfig: any): any {
    const domain = this.getDomain(url);
    const config = sitesConfig.sites[domain];

    if (!config) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    return config;
  }
}
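
A brief sketch of the two pagination modes `buildPageUrl` supports (the URLs are invented): either a query parameter is appended to the base URL, or a `{page}` placeholder is substituted into a configured pattern.

```typescript
import { UrlParser } from './urlParser';

// Query-parameter mode: appends/overwrites ?page=3 on the base URL.
UrlParser.buildPageUrl('https://example.com/story/42', 3, { pageParam: 'page' });
// -> 'https://example.com/story/42?page=3'

// Pattern mode: every {page} placeholder in the configured pattern is replaced.
UrlParser.buildPageUrl('https://example.com/story/42', 3, {
  urlPattern: 'https://example.com/story/42/page/{page}'
});
// -> 'https://example.com/story/42/page/3'
```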
File diff suppressed because one or more lines are too long
18
nginx.conf
@@ -28,7 +28,23 @@ http {
            proxy_cache_bypass $http_upgrade;
        }

        # Backend API routes
        # Scraping routes - completely separate from /api/ to avoid conflicts
        location /scrape/ {
            proxy_pass http://frontend/scrape/;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection 'upgrade';
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_cache_bypass $http_upgrade;
            proxy_connect_timeout 60s;
            proxy_send_timeout 60s;
            proxy_read_timeout 60s;
        }

        # Backend API routes (fallback for all other /api/ routes)
        location /api/ {
            proxy_pass http://backend/api/;
            proxy_set_header Host $host;
304
package-lock.json
generated
@@ -2,5 +2,307 @@
|
|||||||
"name": "StoryCove",
|
"name": "StoryCove",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {}
|
"packages": {
|
||||||
|
"": {
|
||||||
|
"dependencies": {
|
||||||
|
"cheerio": "^1.1.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/boolbase": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==",
|
||||||
|
"license": "ISC"
|
||||||
|
},
|
||||||
|
"node_modules/cheerio": {
|
||||||
|
"version": "1.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz",
|
||||||
|
"integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"cheerio-select": "^2.1.0",
|
||||||
|
"dom-serializer": "^2.0.0",
|
||||||
|
"domhandler": "^5.0.3",
|
||||||
|
"domutils": "^3.2.2",
|
||||||
|
"encoding-sniffer": "^0.2.1",
|
||||||
|
"htmlparser2": "^10.0.0",
|
||||||
|
"parse5": "^7.3.0",
|
||||||
|
"parse5-htmlparser2-tree-adapter": "^7.1.0",
|
||||||
|
"parse5-parser-stream": "^7.1.2",
|
||||||
|
"undici": "^7.12.0",
|
||||||
|
"whatwg-mimetype": "^4.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=20.18.1"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/cheerio-select": {
|
||||||
|
"version": "2.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
|
||||||
|
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"dependencies": {
|
||||||
|
"boolbase": "^1.0.0",
|
||||||
|
"css-select": "^5.1.0",
|
||||||
|
"css-what": "^6.1.0",
|
||||||
|
"domelementtype": "^2.3.0",
|
||||||
|
"domhandler": "^5.0.3",
|
||||||
|
"domutils": "^3.0.1"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/fb55"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/css-select": {
|
||||||
|
"version": "5.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
|
||||||
|
"integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"dependencies": {
|
||||||
|
"boolbase": "^1.0.0",
|
||||||
|
"css-what": "^6.1.0",
|
||||||
|
"domhandler": "^5.0.2",
|
||||||
|
"domutils": "^3.0.1",
|
||||||
|
"nth-check": "^2.0.1"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/fb55"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/css-what": {
|
||||||
|
"version": "6.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
|
||||||
|
"integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 6"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/fb55"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/dom-serializer": {
|
||||||
|
"version": "2.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
||||||
|
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"domelementtype": "^2.3.0",
|
||||||
|
"domhandler": "^5.0.2",
|
||||||
|
"entities": "^4.2.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/domelementtype": {
|
||||||
|
"version": "2.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
||||||
|
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/fb55"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "BSD-2-Clause"
|
||||||
|
},
|
||||||
|
"node_modules/domhandler": {
|
||||||
|
"version": "5.0.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
||||||
|
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"dependencies": {
|
||||||
|
"domelementtype": "^2.3.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 4"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/domutils": {
|
||||||
|
"version": "3.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
|
||||||
|
"integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"dependencies": {
|
||||||
|
"dom-serializer": "^2.0.0",
|
||||||
|
"domelementtype": "^2.3.0",
|
||||||
|
"domhandler": "^5.0.3"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/encoding-sniffer": {
|
||||||
|
"version": "0.2.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz",
|
||||||
|
"integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"iconv-lite": "^0.6.3",
|
||||||
|
"whatwg-encoding": "^3.1.1"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/entities": {
|
||||||
|
"version": "4.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
|
||||||
|
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.12"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/htmlparser2": {
|
||||||
|
"version": "10.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
|
||||||
|
"integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==",
|
||||||
|
"funding": [
|
||||||
|
"https://github.com/fb55/htmlparser2?sponsor=1",
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/fb55"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"domelementtype": "^2.3.0",
|
||||||
|
"domhandler": "^5.0.3",
|
||||||
|
"domutils": "^3.2.1",
|
||||||
|
"entities": "^6.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/htmlparser2/node_modules/entities": {
|
||||||
|
"version": "6.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
|
||||||
|
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.12"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/iconv-lite": {
|
||||||
|
"version": "0.6.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||||
|
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.10.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/nth-check": {
|
||||||
|
"version": "2.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
|
||||||
|
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"dependencies": {
|
||||||
|
"boolbase": "^1.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/nth-check?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/parse5": {
|
||||||
|
"version": "7.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
|
||||||
|
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"entities": "^6.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/parse5-htmlparser2-tree-adapter": {
|
||||||
|
"version": "7.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
|
||||||
|
"integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"domhandler": "^5.0.3",
|
||||||
|
"parse5": "^7.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/parse5-parser-stream": {
|
||||||
|
"version": "7.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
|
||||||
|
"integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"parse5": "^7.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/parse5/node_modules/entities": {
|
||||||
|
"version": "6.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
|
||||||
|
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.12"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/safer-buffer": {
|
||||||
|
"version": "2.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
||||||
|
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
|
"node_modules/undici": {
|
||||||
|
"version": "7.12.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/undici/-/undici-7.12.0.tgz",
|
||||||
|
"integrity": "sha512-GrKEsc3ughskmGA9jevVlIOPMiiAHJ4OFUtaAH+NhfTUSiZ1wMPIQqQvAJUrJspFXJt3EBWgpAeoHEDVT1IBug==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=20.18.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/whatwg-encoding": {
|
||||||
|
"version": "3.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
||||||
|
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"iconv-lite": "0.6.3"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/whatwg-mimetype": {
|
||||||
|
"version": "4.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
||||||
|
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
474
storycove-scraper-spec.md
Normal file
474
storycove-scraper-spec.md
Normal file
@@ -0,0 +1,474 @@
|
|||||||
|
# StoryCove Web Scraper Feature Specification
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.
|
||||||
|
|
||||||
|
## Feature Requirements
|
||||||
|
|
||||||
|
### Core Functionality
|
||||||
|
|
||||||
|
1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
|
||||||
|
- Title (required)
|
||||||
|
- Author (required)
|
||||||
|
- Content (required)
|
||||||
|
- Summary (optional)
|
||||||
|
- Cover Image (optional)
|
||||||
|
- Tags (optional)
|
||||||
|
|
||||||
|
2. **Author Page Scanning**: Users can provide an author page URL to:
|
||||||
|
- Discover all stories by that author
|
||||||
|
- Present a selectable list of stories
|
||||||
|
- Allow bulk import of selected stories
|
||||||
|
|
||||||
|
3. **Multi-page Story Support**: Handle stories split across multiple pages by:
|
||||||
|
- Detecting pagination
|
||||||
|
- Fetching all pages
|
||||||
|
- Merging content in correct order
|
||||||
|
|
||||||
|
### User Interface Flow
|
||||||
|
|
||||||
|
1. **Add Story View Enhancement**:
|
||||||
|
```
|
||||||
|
[Manual Entry] | [Import from URL]
|
||||||
|
|
||||||
|
When "Import from URL" selected:
|
||||||
|
- URL input field
|
||||||
|
- "Fetch" button
|
||||||
|
- Loading indicator during fetch
|
||||||
|
- Pre-filled form with scraped data
|
||||||
|
- Ability to edit before saving
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Bulk Import View** (future enhancement):
|
||||||
|
```
|
||||||
|
- URL input for author page
|
||||||
|
- "Scan for Stories" button
|
||||||
|
- Checkbox list of discovered stories
|
||||||
|
- "Import Selected" button
|
||||||
|
- Progress indicator
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technical Implementation
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
/lib/scraper/
|
||||||
|
├── config/
|
||||||
|
│ └── sites.json # Site configurations
|
||||||
|
├── scraper.ts # Main scraper class
|
||||||
|
├── strategies/ # Strategy implementations
|
||||||
|
│ ├── index.ts
|
||||||
|
│ ├── textExtractor.ts
|
||||||
|
│ ├── linkExtractor.ts
|
||||||
|
│ └── contentCleaner.ts
|
||||||
|
├── utils/
|
||||||
|
│ ├── rateLimit.ts
|
||||||
|
│ ├── cache.ts
|
||||||
|
│ └── urlParser.ts
|
||||||
|
└── types.ts # TypeScript definitions
|
||||||
|
```
|
||||||
|
|
||||||
|
### API Routes
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// /app/api/scrape/story/route.ts
|
||||||
|
POST /api/scrape/story
|
||||||
|
Body: { url: string }
|
||||||
|
Response: {
|
||||||
|
title: string,
|
||||||
|
author: string,
|
||||||
|
content: string,
|
||||||
|
summary?: string,
|
||||||
|
coverImage?: string,
|
||||||
|
tags?: string[]
|
||||||
|
}
|
||||||
|
|
||||||
|
// /app/api/scrape/author/route.ts
|
||||||
|
POST /api/scrape/author
|
||||||
|
Body: { url: string }
|
||||||
|
Response: {
|
||||||
|
stories: Array<{
|
||||||
|
url: string,
|
||||||
|
title: string,
|
||||||
|
author: string,
|
||||||
|
summary?: string
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Core Classes
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// /lib/scraper/types.ts
|
||||||
|
interface SiteConfig {
|
||||||
|
story: StorySelectors;
|
||||||
|
authorPage: AuthorPageSelectors;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface StorySelectors {
|
||||||
|
title: string | SelectorStrategy;
|
||||||
|
author: string | SelectorStrategy;
|
||||||
|
content: string | SelectorStrategy;
|
||||||
|
summary?: string | SelectorStrategy;
|
||||||
|
coverImage?: string | SelectorStrategy;
|
||||||
|
tags?: string | SelectorStrategy;
|
||||||
|
multiPage?: MultiPageConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface SelectorStrategy {
|
||||||
|
strategy: string;
|
||||||
|
[key: string]: any;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ScrapedStory {
|
||||||
|
title: string;
|
||||||
|
author: string;
|
||||||
|
content: string;
|
||||||
|
summary?: string;
|
||||||
|
coverImage?: string;
|
||||||
|
tags?: string[];
|
||||||
|
sourceUrl: string;
|
||||||
|
}
|
||||||
|
```

### Main Scraper Implementation

```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import { SiteConfig, StorySelectors, SelectorStrategy, ScrapedStory } from './types';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, any>;

  constructor() {
    this.config = sitesConfig.sites;
    this.cache = new Map();
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];

    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }

    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    switch (strategy.strategy) {
      case 'text-pattern':
        return this.extractByTextPattern(html, strategy);
      case 'link-with-path':
        return this.extractLinkWithPath($, strategy);
      case 'text-blocks':
        return this.extractTextBlocks($, strategy);
      // ... other strategies
      default:
        throw new Error(`Unknown extraction strategy: ${strategy.strategy}`);
    }
  }
}
```
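
The class above relies on `getDomain` and `fetchWithCache`, which are not shown. A minimal sketch of what they might look like, assuming the global `fetch` API and the in-memory `cache` map from the constructor (retry, timeout and user-agent handling from `globalOptions` is omitted):

```typescript
// /lib/scraper/scraper.ts (sketch of the helpers referenced above; details are assumptions)
private getDomain(url: string): string {
  // Normalise "www.example.com" to "example.com" so it matches the config keys
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) {
    return cached;
  }

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Request failed (${response.status}) for ${url}`);
  }

  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```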

### Strategy Implementations

```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';
// TextPatternStrategy and TextBlockStrategy are assumed to be narrowed variants of
// SelectorStrategy, defined alongside the other types in /lib/scraper/types.ts.
import { TextPatternStrategy, TextBlockStrategy } from '../types';

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Guard against patterns that lack the expected capture group
  return match?.[config.group ?? 1]?.trim() ?? '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any, text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    const text = $elem.clone().children().remove().end().text().trim();

    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
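
`executeStrategy` also dispatches to a `link-with-path` strategy whose extractor is not shown above. One plausible implementation, assuming the strategy object carries a `pathContains` fragment (an assumed field, not defined in this document), returns the text of the first anchor whose `href` contains that fragment:

```typescript
// /lib/scraper/strategies/linkExtractor.ts (sketch; pathContains is an assumed config field)
import * as cheerio from 'cheerio';

interface LinkWithPathStrategy {
  strategy: 'link-with-path';
  pathContains: string;   // e.g. "/authors/" to locate the author link
}

export function extractLinkWithPath(
  $: cheerio.CheerioAPI,
  config: LinkWithPathStrategy
): string {
  let result = '';

  $('a[href]').each((_, elem) => {
    const href = $(elem).attr('href') || '';
    if (!result && href.includes(config.pathContains)) {
      // Use the link text, e.g. the author's display name
      result = $(elem).text().trim();
    }
  });

  return result;
}
```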

### Rate Limiting

```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;

    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }

    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
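
As a usage illustration (the helper below is hypothetical), the limiter is awaited before every outgoing request so that consecutive fetches are spaced by at least `minDelayMs`:

```typescript
import { RateLimiter } from '@/lib/scraper/utils/rateLimit';

const limiter = new RateLimiter(1000);

// Hypothetical helper showing the intended call pattern
async function fetchSequentially(urls: string[]): Promise<string[]> {
  const results: string[] = [];
  for (const url of urls) {
    await limiter.throttle();          // waits only if the previous request was < 1s ago
    const response = await fetch(url);
    results.push(await response.text());
  }
  return results;
}
```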

### Multi-page Story Handling

```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );

    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      const nextLink = $(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      // Re-parse the page just fetched so the next-page selector is evaluated
      // against it rather than against the first page on every iteration.
      $ = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
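
`buildPageUrl` is referenced by the `url-pattern` branch but not shown. A minimal sketch, assuming it simply sets the configured page query parameter on the base URL:

```typescript
// /lib/scraper/scraper.ts (sketch of the buildPageUrl helper; behaviour is an assumption)
private buildPageUrl(baseUrl: string, pageNum: number, config: MultiPageConfig): string {
  // e.g. https://domain.com/story/123 -> https://domain.com/story/123?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```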

### Error Handling

```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```

## Configuration File Structure

```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```

## Usage Example

```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);

    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }

    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```

## Testing Considerations

1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites (see the sketch after this list)
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied
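
As an illustration of points 2 and 3, a fixture-based test might look like the following (Jest/Vitest-style API; the test file and fixture paths are assumptions):

```typescript
// /lib/scraper/strategies/__tests__/textExtractor.test.ts (sketch)
import { readFileSync } from 'fs';
import { join } from 'path';
import * as cheerio from 'cheerio';
import { extractTextBlocks } from '../textExtractor';

// Assumed fixture: a saved copy of a real story page checked into the repo
const fixture = readFileSync(join(__dirname, 'fixtures', 'story-page.html'), 'utf-8');

describe('extractTextBlocks', () => {
  it('returns the main story block from a saved page', () => {
    const $ = cheerio.load(fixture);
    const content = extractTextBlocks($, {
      strategy: 'text-blocks',
      minLength: 500,
      containerHints: ['story', 'chapter']
    });

    expect(content.length).toBeGreaterThan(500);
    expect(content).not.toContain('<nav');   // navigation chrome should not leak into content
  });
});
```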

## Security Considerations

1. **URL Validation**: Only accept HTTP/HTTPS URLs (see the sketch after this list)
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint
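
A sketch of how the first two points could be enforced at the API boundary before any request is made (the helper name and location are illustrative):

```typescript
// /lib/scraper/utils/validateUrl.ts (sketch)
import sitesConfig from '../config/sites.json';

export function validateScrapeUrl(rawUrl: string): URL {
  let url: URL;
  try {
    url = new URL(rawUrl);
  } catch {
    throw new Error('Invalid URL');
  }

  // Only plain HTTP/HTTPS is accepted
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only http(s) URLs are supported');
  }

  // Domain allowlist: the host must match a configured site
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }

  return url;
}
```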

## Future Enhancements

1. **Browser Automation**: Use Playwright for JavaScript-rendered content (see the sketch after this list)
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates
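
For the first point, a minimal sketch of fetching a JavaScript-rendered page with Playwright before handing the HTML to the existing cheerio-based extraction (the integration point is an assumption, not part of the current design):

```typescript
// Possible future fetcher for JS-rendered sites (sketch)
import { chromium } from 'playwright';

export async function fetchRenderedHtml(url: string): Promise<string> {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    // The rendered DOM can then be passed to cheerio.load() like any other HTML
    return await page.content();
  } finally {
    await browser.close();
  }
}
```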