Bugfixes
This commit is contained in:
@@ -314,6 +314,7 @@ export const searchApi = {
|
||||
maxRating?: number;
|
||||
sortBy?: string;
|
||||
sortDir?: string;
|
||||
facetBy?: string[];
|
||||
}): Promise<SearchResult> => {
|
||||
// Create URLSearchParams to properly handle array parameters
|
||||
const searchParams = new URLSearchParams();
|
||||
@@ -334,6 +335,9 @@ export const searchApi = {
|
||||
if (params.tags && params.tags.length > 0) {
|
||||
params.tags.forEach(tag => searchParams.append('tags', tag));
|
||||
}
|
||||
if (params.facetBy && params.facetBy.length > 0) {
|
||||
params.facetBy.forEach(facet => searchParams.append('facetBy', facet));
|
||||
}
|
||||
|
||||
const response = await api.get(`/stories/search?${searchParams.toString()}`);
|
||||
return response.data;
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"searchBefore": "</title>"
|
||||
},
|
||||
"content": {
|
||||
"strategy": "text-blocks",
|
||||
"strategy": "deviantart-content",
|
||||
"minLength": 200,
|
||||
"containerHints": ["journal", "literature", "story", "text", "content"],
|
||||
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]
|
||||
|
||||
@@ -17,6 +17,7 @@ import { UrlParser } from './utils/urlParser';
|
||||
import {
|
||||
extractByTextPattern,
|
||||
extractTextBlocks,
|
||||
extractDeviantArtContent,
|
||||
extractHtmlBetween,
|
||||
extractLinkText,
|
||||
extractLinkWithPath,
|
||||
@@ -246,6 +247,8 @@ export class StoryScraper {
|
||||
return extractLinkWithPath($, strategy as any);
|
||||
case 'text-blocks':
|
||||
return extractTextBlocks($, strategy as any);
|
||||
case 'deviantart-content':
|
||||
return extractDeviantArtContent($, strategy as any);
|
||||
case 'href-pattern':
|
||||
return extractHrefPattern($, strategy as any);
|
||||
case 'html-between':
|
||||
|
||||
@@ -82,6 +82,58 @@ export function extractTextBlocks(
|
||||
return largestBlock ? $(largestBlock.element).html() || '' : '';
|
||||
}
|
||||
|
||||
/**
 * Extracts story HTML from a DeviantArt page.
 *
 * Strategies are tried in order and the first one that yields at least
 * `config.minLength` characters of trimmed text wins:
 *
 * 1. Old format: a single element with class `text` holding the full story.
 * 2. New format: `<p>` elements inside a `div` whose class attribute contains
 *    one of the known container markers.
 * 3. Fallback: the generic `extractTextBlocks` strategy.
 *
 * Note that elements matching `config.excludeSelectors` are removed from the
 * loaded document itself, so the caller's `$` is mutated.
 *
 * @param $ - Cheerio instance loaded with the page HTML.
 * @param config - Strategy options; `minLength` (default 200) and
 *   `excludeSelectors` are read here, and the whole object is forwarded to
 *   `extractTextBlocks` on fallback.
 * @returns The inner HTML of the old-format element, the concatenated outer
 *   HTML of the new-format paragraphs, or whatever `extractTextBlocks` returns.
 */
export function extractDeviantArtContent(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  // `??` rather than `||` so an explicitly configured minLength of 0 is honoured.
  const minLength = config.minLength ?? 200;

  // Remove excluded elements first so they cannot contribute text or markup.
  if (config.excludeSelectors) {
    config.excludeSelectors.forEach(selector => {
      $(selector).remove();
    });
  }

  // Old format: <div class="text"> containing the full story.
  // Each match is measured on its own: `.text()` on the whole selection would
  // concatenate every match while `.html()` only returns the first, so a short
  // first match could be returned on the strength of the others.
  for (const element of $('.text').toArray()) {
    const candidate = $(element);
    if (candidate.text().trim().length >= minLength) {
      return candidate.html() || '';
    }
  }

  // New format: multiple <p> elements inside containers with generated class
  // names such as "_83r8m _2CKTq".
  const newFormatSelectors = [
    'div[class*="_83r8m"] p', // Main story content container
    'div[class*="_2CKTq"] p', // Alternate story content container
    'div[class*="journal"] p' // Generic journal container
  ];

  for (const selector of newFormatSelectors) {
    const paragraphs = $(selector)
      .toArray()
      .map(node => $(node));
    if (paragraphs.length === 0) {
      continue;
    }

    // Total trimmed text across all paragraphs matched by this selector.
    const textLength = paragraphs.reduce(
      (total, paragraph) => total + paragraph.text().trim().length,
      0
    );

    if (textLength >= minLength) {
      // Combine all paragraphs into a single HTML string, in document order.
      return paragraphs
        .map(paragraph => paragraph.prop('outerHTML') || '')
        .join('');
    }
  }

  // Nothing DeviantArt-specific matched; fall back to the generic strategy.
  return extractTextBlocks($, config);
}
|
||||
|
||||
export function extractHtmlBetween(
|
||||
html: string,
|
||||
config: HtmlBetweenStrategy
|
||||
|
||||
Reference in New Issue
Block a user