This commit is contained in:
Stefan Hardegger
2025-07-29 11:02:46 +02:00
parent c08082c0d6
commit 5746001c4a
7 changed files with 103 additions and 6 deletions

View File

@@ -77,7 +77,7 @@ public class TypesenseService {
new Field().name("authorName").type("string").facet(true).sort(true), new Field().name("authorName").type("string").facet(true).sort(true),
new Field().name("seriesId").type("string").facet(true).optional(true), new Field().name("seriesId").type("string").facet(true).optional(true),
new Field().name("seriesName").type("string").facet(true).sort(true).optional(true), new Field().name("seriesName").type("string").facet(true).sort(true).optional(true),
new Field().name("tagNames").type("string[]").facet(true).optional(true), new Field().name("tagNames").type("string[]").facet(true),
new Field().name("rating").type("int32").facet(true).sort(true).optional(true), new Field().name("rating").type("int32").facet(true).sort(true).optional(true),
new Field().name("wordCount").type("int32").facet(true).sort(true).optional(true), new Field().name("wordCount").type("int32").facet(true).sort(true).optional(true),
new Field().name("volume").type("int32").facet(true).sort(true).optional(true), new Field().name("volume").type("int32").facet(true).sort(true).optional(true),
@@ -232,6 +232,9 @@ public class TypesenseService {
.maxFacetValues(100) .maxFacetValues(100)
.sortBy(buildSortParameter(normalizedQuery, sortBy, sortDir)); .sortBy(buildSortParameter(normalizedQuery, sortBy, sortDir));
logger.debug("Typesense search parameters - facetBy: {}, maxFacetValues: {}",
searchParameters.getFacetBy(), searchParameters.getMaxFacetValues());
// Add filters // Add filters
List<String> filterConditions = new ArrayList<>(); List<String> filterConditions = new ArrayList<>();
@@ -269,6 +272,7 @@ public class TypesenseService {
.documents() .documents()
.search(searchParameters); .search(searchParameters);
logger.debug("Search result facet counts: {}", searchResult.getFacetCounts());
List<StorySearchDto> results = convertSearchResult(searchResult); List<StorySearchDto> results = convertSearchResult(searchResult);
Map<String, List<FacetCountDto>> facets = processFacetCounts(searchResult); Map<String, List<FacetCountDto>> facets = processFacetCounts(searchResult);
@@ -375,7 +379,10 @@ public class TypesenseService {
.map(tag -> tag.getName()) .map(tag -> tag.getName())
.collect(Collectors.toList()); .collect(Collectors.toList());
document.put("tagNames", tagNames); document.put("tagNames", tagNames);
logger.debug("Story '{}' has {} tags: {}", story.getTitle(), tagNames.size(), tagNames);
} else { } else {
document.put("tagNames", new ArrayList<>());
logger.debug("Story '{}' has no tags, setting empty array", story.getTitle());
} }
document.put("rating", story.getRating() != null ? story.getRating() : 0); document.put("rating", story.getRating() != null ? story.getRating() : 0);
@@ -406,15 +413,34 @@ public class TypesenseService {
List<FacetCountDto> facetValues = new ArrayList<>(); List<FacetCountDto> facetValues = new ArrayList<>();
if (facetCounts.getCounts() != null) { if (facetCounts.getCounts() != null) {
for (Object countObj : facetCounts.getCounts()) { for (Object countObj : facetCounts.getCounts()) {
if (countObj instanceof Map) { if (countObj instanceof org.typesense.model.FacetCountsCounts) {
Map<String, Object> countMap = (Map<String, Object>) countObj; org.typesense.model.FacetCountsCounts facetCount = (org.typesense.model.FacetCountsCounts) countObj;
String value = (String) countMap.get("value"); String value = facetCount.getValue();
Integer count = (Integer) countMap.get("count"); Integer count = facetCount.getCount();
if (value != null && count != null && count > 0) { if (value != null && count != null && count > 0) {
facetValues.add(new FacetCountDto(value, count)); facetValues.add(new FacetCountDto(value, count));
} }
} else if (countObj instanceof Map) {
// Fallback for Map-based responses
Map<String, Object> countMap = (Map<String, Object>) countObj;
String value = (String) countMap.get("value");
Object countValue = countMap.get("count");
if (value != null && countValue != null) {
Integer count = null;
if (countValue instanceof Integer) {
count = (Integer) countValue;
} else if (countValue instanceof Number) {
count = ((Number) countValue).intValue();
}
if (count != null && count > 0) {
facetValues.add(new FacetCountDto(value, count));
}
}
} }
} }
} }
@@ -432,6 +458,12 @@ public class TypesenseService {
} }
} }
// DEBUG: Log final facet processing results
logger.info("FACET DEBUG: Final facetMap contents: {}", facetMap);
if (facetMap.isEmpty()) {
logger.info("FACET DEBUG: No facets were processed - investigating why");
}
return facetMap; return facetMap;
} }

View File

@@ -57,6 +57,7 @@ export default function LibraryPage() {
tags: selectedTags.length > 0 ? selectedTags : undefined, tags: selectedTags.length > 0 ? selectedTags : undefined,
sortBy: sortOption, sortBy: sortOption,
sortDir: sortDirection, sortDir: sortDirection,
facetBy: ['tagNames'], // Request tag facets for the filter UI
}); });
const currentStories = result?.results || []; const currentStories = result?.results || [];

View File

@@ -314,6 +314,7 @@ export const searchApi = {
maxRating?: number; maxRating?: number;
sortBy?: string; sortBy?: string;
sortDir?: string; sortDir?: string;
facetBy?: string[];
}): Promise<SearchResult> => { }): Promise<SearchResult> => {
// Create URLSearchParams to properly handle array parameters // Create URLSearchParams to properly handle array parameters
const searchParams = new URLSearchParams(); const searchParams = new URLSearchParams();
@@ -334,6 +335,9 @@ export const searchApi = {
if (params.tags && params.tags.length > 0) { if (params.tags && params.tags.length > 0) {
params.tags.forEach(tag => searchParams.append('tags', tag)); params.tags.forEach(tag => searchParams.append('tags', tag));
} }
if (params.facetBy && params.facetBy.length > 0) {
params.facetBy.forEach(facet => searchParams.append('facetBy', facet));
}
const response = await api.get(`/stories/search?${searchParams.toString()}`); const response = await api.get(`/stories/search?${searchParams.toString()}`);
return response.data; return response.data;

View File

@@ -12,7 +12,7 @@
"searchBefore": "</title>" "searchBefore": "</title>"
}, },
"content": { "content": {
"strategy": "text-blocks", "strategy": "deviantart-content",
"minLength": 200, "minLength": 200,
"containerHints": ["journal", "literature", "story", "text", "content"], "containerHints": ["journal", "literature", "story", "text", "content"],
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"] "excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]

View File

@@ -17,6 +17,7 @@ import { UrlParser } from './utils/urlParser';
import { import {
extractByTextPattern, extractByTextPattern,
extractTextBlocks, extractTextBlocks,
extractDeviantArtContent,
extractHtmlBetween, extractHtmlBetween,
extractLinkText, extractLinkText,
extractLinkWithPath, extractLinkWithPath,
@@ -246,6 +247,8 @@ export class StoryScraper {
return extractLinkWithPath($, strategy as any); return extractLinkWithPath($, strategy as any);
case 'text-blocks': case 'text-blocks':
return extractTextBlocks($, strategy as any); return extractTextBlocks($, strategy as any);
case 'deviantart-content':
return extractDeviantArtContent($, strategy as any);
case 'href-pattern': case 'href-pattern':
return extractHrefPattern($, strategy as any); return extractHrefPattern($, strategy as any);
case 'html-between': case 'html-between':

View File

@@ -82,6 +82,58 @@ export function extractTextBlocks(
return largestBlock ? $(largestBlock.element).html() || '' : ''; return largestBlock ? $(largestBlock.element).html() || '' : '';
} }
/**
 * Extracts the story body from a DeviantArt page.
 *
 * Strategies are tried in order and the first one that yields at least
 * `minLength` characters of text wins:
 *
 * 1. Legacy layout: a `.text` element that holds the whole story.
 * 2. Newer layout: `<p>` elements inside containers whose class attribute
 *    contains `_83r8m`, `_2CKTq` or `journal`.
 * 3. Fallback: the generic `extractTextBlocks` strategy.
 *
 * Side effect: every element matching `config.excludeSelectors` is removed
 * from the document behind `$`, so the caller's document is mutated.
 *
 * @param $ - Loaded cheerio document to search.
 * @param config - Strategy options. `minLength` is the minimum amount of
 *   trimmed text a candidate must contain (defaults to 200 when unset);
 *   `excludeSelectors` lists selectors to strip before extraction.
 * @returns The inner HTML of the matching legacy container, the concatenated
 *   outer HTML of the matching paragraphs, or whatever `extractTextBlocks`
 *   returns when neither layout is found.
 */
export function extractDeviantArtContent(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  // `??` rather than `||` so an explicit `minLength: 0` is honoured.
  const minLength = config.minLength ?? 200;

  // Remove excluded elements first so they never count towards text length.
  if (config.excludeSelectors) {
    for (const selector of config.excludeSelectors) {
      $(selector).remove();
    }
  }

  // Legacy layout. Each `.text` element is evaluated on its own: on a
  // multi-element selection `.text()` concatenates the text of every match
  // while `.html()` only returns the first one, so checking the selection as
  // a whole could accept a page and then return a short, unrelated fragment.
  for (const element of $('.text').toArray()) {
    const candidate = $(element);
    if (candidate.text().trim().length >= minLength) {
      return candidate.html() ?? '';
    }
  }

  // Newer layout: the story is split across paragraphs in hashed-class
  // containers. Selectors are ordered from most to least specific.
  const newFormatSelectors = [
    'div[class*="_83r8m"] p', // Main story content container
    'div[class*="_2CKTq"] p', // Alternate story content container
    'div[class*="journal"] p' // Generic journal container
  ];

  for (const selector of newFormatSelectors) {
    const paragraphs = $(selector).toArray();
    if (paragraphs.length === 0) {
      continue;
    }

    // Sum of per-paragraph trimmed lengths; equal to the length of the
    // trimmed texts joined without separators.
    const textLength = paragraphs.reduce(
      (total, paragraph) => total + $(paragraph).text().trim().length,
      0
    );

    if (textLength >= minLength) {
      // Keep the <p> wrappers so paragraph structure survives in the output.
      return paragraphs
        .map(paragraph => $(paragraph).prop('outerHTML') ?? '')
        .join('');
    }
  }

  // Neither DeviantArt layout matched: fall back to the generic strategy.
  return extractTextBlocks($, config);
}
export function extractHtmlBetween( export function extractHtmlBetween(
html: string, html: string,
config: HtmlBetweenStrategy config: HtmlBetweenStrategy

5
package.json Normal file
View File

@@ -0,0 +1,5 @@
{
"dependencies": {
"cheerio": "^1.1.2"
}
}