This commit is contained in:
Stefan Hardegger
2025-07-29 11:02:46 +02:00
parent c08082c0d6
commit 5746001c4a
7 changed files with 103 additions and 6 deletions

View File

@@ -57,6 +57,7 @@ export default function LibraryPage() {
tags: selectedTags.length > 0 ? selectedTags : undefined,
sortBy: sortOption,
sortDir: sortDirection,
facetBy: ['tagNames'], // Request tag facets for the filter UI
});
const currentStories = result?.results || [];

View File

@@ -314,6 +314,7 @@ export const searchApi = {
maxRating?: number;
sortBy?: string;
sortDir?: string;
facetBy?: string[];
}): Promise<SearchResult> => {
// Create URLSearchParams to properly handle array parameters
const searchParams = new URLSearchParams();
@@ -334,6 +335,9 @@ export const searchApi = {
if (params.tags && params.tags.length > 0) {
params.tags.forEach(tag => searchParams.append('tags', tag));
}
if (params.facetBy && params.facetBy.length > 0) {
params.facetBy.forEach(facet => searchParams.append('facetBy', facet));
}
const response = await api.get(`/stories/search?${searchParams.toString()}`);
return response.data;

View File

@@ -12,7 +12,7 @@
"searchBefore": "</title>"
},
"content": {
"strategy": "text-blocks",
"strategy": "deviantart-content",
"minLength": 200,
"containerHints": ["journal", "literature", "story", "text", "content"],
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]

View File

@@ -17,6 +17,7 @@ import { UrlParser } from './utils/urlParser';
import {
extractByTextPattern,
extractTextBlocks,
extractDeviantArtContent,
extractHtmlBetween,
extractLinkText,
extractLinkWithPath,
@@ -246,6 +247,8 @@ export class StoryScraper {
return extractLinkWithPath($, strategy as any);
case 'text-blocks':
return extractTextBlocks($, strategy as any);
case 'deviantart-content':
return extractDeviantArtContent($, strategy as any);
case 'href-pattern':
return extractHrefPattern($, strategy as any);
case 'html-between':

View File

@@ -82,6 +82,58 @@ export function extractTextBlocks(
return largestBlock ? $(largestBlock.element).html() || '' : '';
}
/**
 * Extracts story HTML from a DeviantArt page.
 *
 * Candidates are tried in this order:
 *  1. Old layout: a `.text` element that holds the story on its own.
 *  2. New layout: `<p>` elements inside hashed-class or journal containers.
 *  3. The generic `extractTextBlocks` strategy as a last resort.
 *
 * Elements matching `config.excludeSelectors` are removed from `$` in place,
 * so the document passed in by the caller is mutated.
 *
 * @param $ - Loaded cheerio document to search.
 * @param config - Strategy options. `minLength` (default 200) is the minimum
 *   trimmed text length a candidate must reach; `excludeSelectors` lists
 *   selectors to strip before searching.
 * @returns The HTML of the first candidate that is long enough, otherwise
 *   whatever `extractTextBlocks` returns.
 */
export function extractDeviantArtContent(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  // `??` instead of `||` so an explicit `minLength: 0` is honoured rather
  // than being silently replaced by the default.
  const minLength = config.minLength ?? 200;

  // Remove excluded elements first
  if (config.excludeSelectors) {
    config.excludeSelectors.forEach(selector => {
      $(selector).remove();
    });
  }

  // DeviantArt has two main content structures:
  // 1. Old format: <div class="text"> containing the full story
  // 2. New format: <div class="_83r8m _2CKTq"> or similar classes containing multiple <p> elements

  // Try the old format first. On a multi-element selection `.text()`
  // concatenates every match while `.html()` returns only the first one, so
  // each `.text` element is measured individually; otherwise a short leading
  // `.text` element could be returned in place of the actual story.
  for (const element of $('.text').toArray()) {
    const candidate = $(element);
    if (candidate.text().trim().length >= minLength) {
      return candidate.html() || '';
    }
  }

  // Try the new format (multiple paragraphs in specific containers)
  const newFormatSelectors = [
    'div[class*="_83r8m"] p', // Main story content container
    'div[class*="_2CKTq"] p', // Alternate story content container
    'div[class*="journal"] p' // Generic journal container
  ];

  for (const selector of newFormatSelectors) {
    const paragraphs = $(selector);
    if (paragraphs.length === 0) {
      continue;
    }

    // Single pass: accumulate the trimmed text length used for the threshold
    // and the combined outer HTML that is returned if the threshold is met.
    let textLength = 0;
    let combinedHtml = '';
    paragraphs.each((_, p) => {
      const paragraph = $(p);
      textLength += paragraph.text().trim().length;
      combinedHtml += paragraph.prop('outerHTML') || '';
    });

    // All paragraphs matched by this selector count towards the threshold.
    if (textLength >= minLength) {
      return combinedHtml;
    }
  }

  // Fallback to the original text-blocks strategy
  return extractTextBlocks($, config);
}
export function extractHtmlBetween(
html: string,
config: HtmlBetweenStrategy