Small improvements

This commit is contained in:
Stefan Hardegger
2025-08-15 07:58:36 +02:00
parent 460ec358ca
commit 6b83783381
3 changed files with 31 additions and 2 deletions

View File

@@ -78,6 +78,10 @@ export function extractResponsiveImage(
return { url, width };
});
if (sources.length === 0) {
return img.attr('src') || '';
}
const largest = sources.reduce((prev: any, current: any) =>
prev.width > current.width ? prev : current
);

View File

@@ -75,6 +75,10 @@ export function extractTextBlocks(
}
// Fallback to largest block
if (blocks.length === 0) {
return '';
}
const largestBlock = blocks.reduce((prev, current) =>
prev.text.length > current.text.length ? prev : current
);
@@ -86,6 +90,20 @@ export function extractDeviantArtContent(
$: cheerio.CheerioAPI,
config: TextBlockStrategy
): string {
// Check for mature content warning or login requirement
const matureWarning = $('.deviation-overlay.blocked.mature, .mature-filter, .ismature').first();
if (matureWarning.length > 0) {
throw new Error('Content is restricted by mature content filter. Login may be required to access this story.');
}
const loginRequired = $('a[href*="join"][href*="mature"], a[href*="login"]').filter((_, elem) => {
const text = $(elem).text().toLowerCase();
return text.includes('log in') || text.includes('sign up');
});
if (loginRequired.length > 0) {
throw new Error('Login is required to access this DeviantArt content.');
}
// Remove excluded elements first
if (config.excludeSelectors) {
config.excludeSelectors.forEach(selector => {
@@ -93,9 +111,10 @@ export function extractDeviantArtContent(
});
}
// DeviantArt has multiple content structures:
// 1. Old format: <div class="text"> containing the full story
// 2. New format: <div class="_83r8m _2CKTq"> or similar classes containing multiple <p> elements
// 3. Legacy journal format: .legacy-journal .text
// Try the old format first (single text div)
const textDiv = $('.text');
@@ -103,6 +122,12 @@ export function extractDeviantArtContent(
return textDiv.html() || '';
}
// Try legacy journal format
const legacyJournal = $('.legacy-journal .text, .legacy-journal .journal-text');
if (legacyJournal.length > 0 && legacyJournal.text().trim().length >= (config.minLength || 200)) {
return legacyJournal.html() || '';
}
// Try the new format (multiple paragraphs in specific containers)
const newFormatSelectors = [
'div[class*="_83r8m"] p', // Main story content container